Mirror of https://github.com/huggingface/transformers.git (synced 2025-11-02 18:54:35 +08:00)

Compare commits: remove-scr ... trigger_ci (2 commits)

| Author | SHA1 | Date |
|---|---|---|
|  | 0eee6fe111 |  |
|  | d6b17c2ce8 |  |
.github/workflows/model_jobs.yml (vendored, 4 changes)
@@ -107,9 +107,9 @@ jobs:
run: |
echo "${{ inputs.machine_type }}"

if [ "${{ inputs.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
if [ "${{ inputs.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
machine_type=single-gpu
elif [ "${{ inputs.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
elif [ "${{ inputs.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
machine_type=multi-gpu
else
machine_type=${{ inputs.machine_type }}
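The step above normalizes the runner label into a generic machine type; only the AWS cache-group names change here (g4dn to g5). A minimal Python sketch of the same mapping, for reference only (it is not part of the workflow):

```python
def resolve_machine_type(runner_label: str) -> str:
    """Mirror of the shell mapping in model_jobs.yml after the g5 rename."""
    mapping = {
        "aws-g5-4xlarge-cache": "single-gpu",
        "aws-g5-12xlarge-cache": "multi-gpu",
    }
    # Any other label passes through unchanged, as in the workflow's else branch.
    return mapping.get(runner_label, runner_label)

print(resolve_machine_type("aws-g5-12xlarge-cache"))  # multi-gpu
```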
.github/workflows/self-scheduled-caller.yml (vendored, 67 changes)
@@ -7,7 +7,7 @@ on:
- cron: "17 2 * * *"
push:
branches:
- run_scheduled_ci*
- trigger_ci_on_a10
workflow_dispatch:
inputs:
prev_workflow_run_id:

@@ -25,7 +25,7 @@ on:
# Used for `push` to easily modiffy the target workflow runs to compare against
env:
prev_workflow_run_id: ""
other_workflow_run_id: ""
other_workflow_run_id: "15770139098"

jobs:

@@ -50,70 +50,9 @@ jobs:
uses: ./.github/workflows/self-scheduled.yml
with:
job: run_models_gpu
slack_report_channel: "#transformers-ci-daily-models"
slack_report_channel: "#transformers-ci-dummy"
runner: daily-ci
docker: huggingface/transformers-all-latest-gpu
ci_event: Daily CI
report_repo_id: hf-internal-testing/transformers_daily_ci
secrets: inherit

torch-pipeline:
name: Torch pipeline CI
uses: ./.github/workflows/self-scheduled.yml
with:
job: run_pipelines_torch_gpu
slack_report_channel: "#transformers-ci-daily-pipeline-torch"
runner: daily-ci
docker: huggingface/transformers-pytorch-gpu
ci_event: Daily CI
report_repo_id: hf-internal-testing/transformers_daily_ci
secrets: inherit

example-ci:
name: Example CI
uses: ./.github/workflows/self-scheduled.yml
with:
job: run_examples_gpu
slack_report_channel: "#transformers-ci-daily-examples"
runner: daily-ci
docker: huggingface/transformers-all-latest-gpu
ci_event: Daily CI
report_repo_id: hf-internal-testing/transformers_daily_ci
secrets: inherit

trainer-fsdp-ci:
name: Trainer/FSDP CI
uses: ./.github/workflows/self-scheduled.yml
with:
job: run_trainer_and_fsdp_gpu
slack_report_channel: "#transformers-ci-daily-training"
runner: daily-ci
docker: huggingface/transformers-all-latest-gpu
ci_event: Daily CI
report_repo_id: hf-internal-testing/transformers_daily_ci
secrets: inherit

deepspeed-ci:
name: DeepSpeed CI
uses: ./.github/workflows/self-scheduled.yml
with:
job: run_torch_cuda_extensions_gpu
slack_report_channel: "#transformers-ci-daily-training"
runner: daily-ci
docker: huggingface/transformers-pytorch-deepspeed-latest-gpu
ci_event: Daily CI
working-directory-prefix: /workspace
report_repo_id: hf-internal-testing/transformers_daily_ci
secrets: inherit

quantization-ci:
name: Quantization CI
uses: ./.github/workflows/self-scheduled.yml
with:
job: run_quantization_torch_gpu
slack_report_channel: "#transformers-ci-daily-quantization"
runner: daily-ci
docker: huggingface/transformers-quantization-latest-gpu
ci_event: Daily CI
report_repo_id: hf-internal-testing/transformers_daily_ci
secrets: inherit
.github/workflows/self-scheduled.yml (vendored, 30 changes)
@@ -53,7 +53,7 @@ jobs:
name: Setup
strategy:
matrix:
machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
runs-on:
group: '${{ matrix.machine_type }}'
container:

@@ -111,7 +111,7 @@ jobs:
strategy:
fail-fast: false
matrix:
machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }}
uses: ./.github/workflows/model_jobs.yml
with:

@@ -129,7 +129,7 @@ jobs:
strategy:
fail-fast: false
matrix:
machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
slice_id: [0, 1]
uses: ./.github/workflows/model_jobs.yml
with:

@@ -147,7 +147,7 @@ jobs:
strategy:
fail-fast: false
matrix:
machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
runs-on:
group: '${{ matrix.machine_type }}'
container:

@@ -181,9 +181,9 @@ jobs:
run: |
echo "${{ matrix.machine_type }}"

if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
machine_type=single-gpu
elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
machine_type=multi-gpu
else
machine_type=${{ matrix.machine_type }}

@@ -215,7 +215,7 @@ jobs:
strategy:
fail-fast: false
matrix:
machine_type: [aws-g4dn-4xlarge-cache]
machine_type: [aws-g5-4xlarge-cache]
runs-on:
group: '${{ matrix.machine_type }}'
container:

@@ -249,9 +249,9 @@ jobs:
run: |
echo "${{ matrix.machine_type }}"

if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
machine_type=single-gpu
elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
machine_type=multi-gpu
else
machine_type=${{ matrix.machine_type }}

@@ -284,7 +284,7 @@ jobs:
strategy:
fail-fast: false
matrix:
machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
runs-on:
group: '${{ matrix.machine_type }}'
container:

@@ -346,9 +346,9 @@ jobs:
run: |
echo "${{ matrix.machine_type }}"

if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
machine_type=single-gpu
elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
machine_type=multi-gpu
else
machine_type=${{ matrix.machine_type }}

@@ -383,7 +383,7 @@ jobs:
fail-fast: false
matrix:
folders: ${{ fromJson(needs.setup.outputs.quantization_matrix) }}
machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
runs-on:
group: '${{ matrix.machine_type }}'
container:

@@ -426,9 +426,9 @@ jobs:
run: |
echo "${{ matrix.machine_type }}"

if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
machine_type=single-gpu
elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
machine_type=multi-gpu
else
machine_type=${{ matrix.machine_type }}
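Taken together, the matrix entries above mean every scheduled job now fans out over the g5 runner groups. A small illustrative sketch (not part of any workflow file) of how such a matrix expands:

```python
from itertools import product

machine_types = ["aws-g5-4xlarge-cache", "aws-g5-12xlarge-cache"]
slice_ids = [0, 1]  # run_models_gpu instead takes slice_ids from the setup job's output
for machine_type, slice_id in product(machine_types, slice_ids):
    print(f"runs-on group={machine_type}  slice_id={slice_id}")
```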
@@ -1,93 +0,0 @@
FROM intel/deep-learning-essentials:2025.1.3-0-devel-ubuntu22.04 AS base
LABEL maintainer="Hugging Face"

SHELL ["/bin/bash", "-c"]

ARG PYTHON_VER=3.11
ENV TORCH_DEVICE_BACKEND_AUTOLOAD=0
ENV DEBIAN_FRONTEND=noninteractive

RUN apt-get remove -y python3.10 && apt-get autoremove -y
RUN apt-get update && \
apt-get install -y software-properties-common && \
add-apt-repository -y ppa:deadsnakes/ppa && \
apt-get update && \
apt-get install -y python$PYTHON_VER python$PYTHON_VER-dev python3-pip && \
ln -sf /usr/bin/python$PYTHON_VER /usr/bin/python3 && \
ln -sf /usr/bin/python3 /usr/bin/python && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

RUN apt-get update && \
apt-get -y install \
apt-utils \
build-essential \
ca-certificates \
clinfo \
curl \
git \
git-lfs \
vim \
numactl \
gnupg2 \
gpg-agent \
zlib1g-dev \
rsync \
sudo \
libnl-genl-3-200 \
xpu-smi \
unzip \
ffmpeg \
tesseract-ocr \
espeak-ng \
wget \
ncurses-term && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

RUN apt-get update && \
apt-get install -y \
linux-headers-$(uname -r) \
linux-modules-extra-$(uname -r) \
flex bison \
intel-fw-gpu intel-i915-dkms xpu-smi \
intel-opencl-icd libze-intel-gpu1 libze1 \
intel-media-va-driver-non-free libmfx-gen1 libvpl2 \
libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \
libglapi-mesa libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \
mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo intel-ocloc \
libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev libze-dev && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

RUN pip install --upgrade pip
RUN pip install triton==3.3.0

RUN pip install torch==2.7.0 torchvision==0.22.0 torchaudio==2.7.0 --index-url https://download.pytorch.org/whl/xpu --no-cache-dir

RUN pip install evaluate torchdata pyctcdecode pytesseract decord galore-torch fire scipy scikit-learn sentencepiece sacremoses nltk rouge_score librosa soundfile g2p_en mpi4py requests_mock
RUN pip install pretty_midi essentia resampy Levenshtein av sacrebleu phonemizer invisible_watermark schedulefree
RUN pip install gguf hqq compressed_tensors gptqmodel mergekit autoawq deepspeed torchao onnx
RUN pip install hf_transfer huggingface-hub hf-doc-builder datasets optimum-quanto timm transformers accelerate optimum peft

RUN pip install git+https://github.com/linkedin/Liger-Kernel.git --extra-index-url https://download.pytorch.org/whl/test/xpu

# install bitsandbytes
RUN pip install git+https://github.com/bitsandbytes-foundation/bitsandbytes.git

ENV OCL_ICD_VENDORS=/etc/OpenCL/vendors
ENV FI_PROVIDER_PATH=${I_MPI_ROOT}/lib/libfabric/prov:/usr/lib/x86_64-linux-gnu/libfabric
ENV CCL_ROOT=/usr/local
ENV CCL_ATL_TRANSPORT=ofi
ENV I_MPI_ROOT=/usr/local
ENV CLASSPATH=${I_MPI_ROOT}/lib/mpi.jar
ENV PATH=${I_MPI_ROOT}/bin/libfabric:${PATH}
ENV LD_LIBRARY_PATH=${I_MPI_ROOT}/lib/libfabric:${LD_LIBRARY_PATH}

RUN touch /entrypoint.sh
RUN chmod +x /entrypoint.sh
RUN echo "#!/bin/bash" >> /entrypoint.sh
RUN echo "source /opt/intel/oneapi/setvars.sh --force && /bin/bash" >> /entrypoint.sh

ENTRYPOINT ["/entrypoint.sh"]
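The deleted Dockerfile above targeted Intel XPU devices. As a quick, hedged sanity check for an image built from it (assuming the torch 2.7.0 XPU wheels it installs; this snippet is not part of the diff):

```python
import torch

print(torch.__version__)          # an XPU wheel is expected to report an +xpu-style build
print(torch.xpu.is_available())   # True only inside the image on a machine with an Intel GPU
if torch.xpu.is_available():
    x = torch.ones(2, 2, device="xpu")
    print((x + x).cpu())
```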
@@ -56,7 +56,7 @@ Here is how to use the processor to process text and audio:

```python
>>> # let's load an audio sample from an Arabic speech corpus
>>> from datasets import load_dataset
>>> dataset = load_dataset("halabi2016/arabic_speech_corpus", split="test", streaming=True)
>>> dataset = load_dataset("arabic_speech_corpus", split="test", streaming=True, trust_remote_code=True)
>>> audio_sample = next(iter(dataset))["audio"]

>>> # now, process it

@@ -56,7 +56,7 @@ Here is how to use the processor to process text and audio:

```python
>>> # let's load an audio sample from an Arabic speech corpus
>>> from datasets import load_dataset
>>> dataset = load_dataset("halabi2016/arabic_speech_corpus", split="test", streaming=True)
>>> dataset = load_dataset("arabic_speech_corpus", split="test", streaming=True, trust_remote_code=True)
>>> audio_sample = next(iter(dataset))["audio"]

>>> # now, process it
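The recurring change in the documentation and test hunks that follow is the `trust_remote_code=True` argument. As a hedged aside (not part of the diff): in recent `datasets` releases, loading a dataset defined by a Python loading script requires this explicit opt-in, while datasets stored as plain data files do not; which repositories still need it depends on how each one is stored. The snippet below only illustrates the call shape used throughout these hunks.

```python
from datasets import load_dataset

# Opt in explicitly when a dataset repository ships its own loading script.
dataset = load_dataset(
    "arabic_speech_corpus", split="test", streaming=True, trust_remote_code=True
)
audio_sample = next(iter(dataset))["audio"]
```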
@@ -264,6 +264,7 @@ class ExamplesTests(TestCasePlus):
--dataset_config clean
--train_split_name validation
--eval_split_name validation
--trust_remote_code
--output_dir {tmp_dir}
--overwrite_output_dir
--num_train_epochs=2

@@ -312,6 +312,7 @@ class ExamplesTestsNoTrainer(TestCasePlus):
{self.examples_dir}/pytorch/image-classification/run_image_classification_no_trainer.py
--model_name_or_path google/vit-base-patch16-224-in21k
--dataset_name hf-internal-testing/cats_vs_dogs_sample
--trust_remote_code
--learning_rate 1e-4
--per_device_train_batch_size 2
--per_device_eval_batch_size 1

@@ -390,6 +390,7 @@ class ExamplesTests(TestCasePlus):
--output_dir {tmp_dir}
--model_name_or_path google/vit-base-patch16-224-in21k
--dataset_name hf-internal-testing/cats_vs_dogs_sample
--trust_remote_code
--do_train
--do_eval
--learning_rate 1e-4

@@ -423,6 +424,7 @@ class ExamplesTests(TestCasePlus):
--dataset_config_name clean
--train_split_name validation
--eval_split_name validation
--trust_remote_code
--do_train
--do_eval
--learning_rate 1e-4

@@ -453,6 +455,7 @@ class ExamplesTests(TestCasePlus):
--dataset_config_name clean
--train_split_name validation
--eval_split_name validation
--trust_remote_code
--do_train
--do_eval
--learning_rate 1e-4

@@ -485,6 +488,7 @@ class ExamplesTests(TestCasePlus):
--dataset_config_name clean
--train_split_name validation
--eval_split_name validation
--trust_remote_code
--do_train
--do_eval
--learning_rate 1e-4

@@ -512,6 +516,7 @@ class ExamplesTests(TestCasePlus):
--output_dir {tmp_dir}
--model_name_or_path hf-internal-testing/tiny-random-wav2vec2
--dataset_name anton-l/superb_demo
--trust_remote_code
--dataset_config_name ks
--train_split_name test
--eval_split_name test

@@ -546,6 +551,7 @@ class ExamplesTests(TestCasePlus):
--dataset_name hf-internal-testing/librispeech_asr_dummy
--dataset_config_names clean
--dataset_split_names validation
--trust_remote_code
--learning_rate 1e-4
--per_device_train_batch_size 4
--per_device_eval_batch_size 4

@@ -566,6 +572,7 @@ class ExamplesTests(TestCasePlus):
run_mae.py
--output_dir {tmp_dir}
--dataset_name hf-internal-testing/cats_vs_dogs_sample
--trust_remote_code
--do_train
--do_eval
--learning_rate 1e-4

@@ -315,6 +315,7 @@ class ExamplesTests(TestCasePlus):
testargs = f"""
run_image_classification.py
--dataset_name hf-internal-testing/cats_vs_dogs_sample
--trust_remote_code
--model_name_or_path microsoft/resnet-18
--do_train
--do_eval
@@ -27,6 +27,8 @@ from ..utils import is_torch_greater_or_equal, logging
from ..utils.generic import GeneralInterface

ALL_LAYERNORM_LAYERS = [nn.LayerNorm]

logger = logging.get_logger(__name__)

# Cache this result has it's a C FFI call which can be pretty time-consuming
@@ -206,7 +206,7 @@ def convert_audio_spectrogram_transformer_checkpoint(model_name, pytorch_dump_fo

if "speech-commands" in model_name:
# TODO: Convert dataset to Parquet
dataset = load_dataset("google/speech_commands", "v0.02", split="validation")
dataset = load_dataset("google/speech_commands", "v0.02", split="validation", trust_remote_code=True)
waveform = dataset[0]["audio"]["array"]
else:
filepath = hf_hub_download(

@@ -266,7 +266,7 @@ def convert_beit_checkpoint(checkpoint_url, pytorch_dump_folder_path):
# Check outputs on an image
if is_semantic:
image_processor = BeitImageProcessor(size=config.image_size, do_center_crop=False)
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
image = Image.open(ds[0]["file"])
else:
image_processor = BeitImageProcessor(
@@ -30,6 +30,7 @@ from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...pytorch_utils import ALL_LAYERNORM_LAYERS
from ...utils import (
LossKwargs,
auto_docstring,

@@ -71,6 +72,9 @@ class ChameleonRMSNorm(nn.Module):
return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"

ALL_LAYERNORM_LAYERS.append(ChameleonRMSNorm)

# copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Chameleon
# TODO(joao): add me back asap :)
class ChameleonRotaryEmbedding(nn.Module):
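This import-and-append pattern repeats across most of the model files in this diff. A condensed sketch of the idea (the norm class below is a hypothetical stand-in, not copied from any single file): custom norm layers register themselves in ALL_LAYERNORM_LAYERS so that the Trainer can exclude their parameters from weight decay.

```python
import torch
import torch.nn as nn
from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS

class MyRMSNorm(nn.Module):  # stand-in for ChameleonRMSNorm, LlamaRMSNorm, ...
    def __init__(self, hidden_size, eps=1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        return self.weight * hidden_states * torch.rsqrt(variance + self.variance_epsilon)

# Registration makes the Trainer treat this layer like nn.LayerNorm for weight decay.
ALL_LAYERNORM_LAYERS.append(MyRMSNorm)
```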
@@ -35,6 +35,7 @@ from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
from ...modeling_rope_utils import dynamic_rope_update
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
from ...processing_utils import Unpack
from ...pytorch_utils import ALL_LAYERNORM_LAYERS
from ...utils import LossKwargs, logging
from ..llama.modeling_llama import (
LlamaAttention,

@@ -68,6 +69,9 @@ class CohereLayerNorm(nn.Module):
return hidden_states.to(input_dtype)

ALL_LAYERNORM_LAYERS.append(CohereLayerNorm)

class CohereRotaryEmbedding(LlamaRotaryEmbedding):
@torch.no_grad()
@dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
@@ -226,7 +226,7 @@ def convert_wav2vec2_checkpoint(

processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-lv60")

ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True)
input_audio = [x["array"] for x in ds[:4]["audio"]]

inputs = processor(input_audio, return_tensors="pt", padding=True)
@@ -34,6 +34,7 @@ from ....modeling_outputs import (
TokenClassifierOutput,
)
from ....modeling_utils import PreTrainedModel
from ....pytorch_utils import ALL_LAYERNORM_LAYERS
from ....utils import (
add_code_sample_docstrings,
add_start_docstrings,

@@ -310,6 +311,10 @@ class MegaSequenceNorm(nn.Module):
return self.norm(input)

# add this layernorm class to ALL_LAYERNORM_LAYERS
ALL_LAYERNORM_LAYERS.append(MegaSequenceNorm)

class MegaMultiDimensionDampedEma(nn.Module):
"""
Mega's Exponential Moving Average layer, largely left unmodified from the original repo with the exception of
@@ -27,6 +27,7 @@ from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutputWithPast, MoeCausalLMOutputWithPast, MoeModelOutputWithPast
from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...pytorch_utils import ALL_LAYERNORM_LAYERS
from ...utils import auto_docstring, is_torch_flex_attn_available, logging
from .configuration_granitemoe import GraniteMoeConfig

@@ -144,6 +145,9 @@ class GraniteMoeRMSNorm(nn.Module):
return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"

ALL_LAYERNORM_LAYERS.append(GraniteMoeRMSNorm)

# Copied from transformers.models.granite.modeling_granite.GraniteRotaryEmbedding with Granite->GraniteMoe
class GraniteMoeRotaryEmbedding(nn.Module):
def __init__(self, config: GraniteMoeConfig, device=None):
@@ -35,6 +35,7 @@ from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import ModelOutput
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PretrainedConfig, PreTrainedModel
from ...processing_utils import Unpack
from ...pytorch_utils import ALL_LAYERNORM_LAYERS
from ...utils import LossKwargs, auto_docstring, can_return_tuple, is_torch_flex_attn_available, logging
from .configuration_idefics import IdeficsConfig
from .perceiver import IdeficsPerceiverResampler

@@ -385,6 +386,9 @@ class IdeficsRMSNorm(nn.Module):
return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"

ALL_LAYERNORM_LAYERS.append(IdeficsRMSNorm)

# this was adapted from LlamaRotaryEmbedding
class IdeficsEmbedding(torch.nn.Module):
def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
@@ -1223,7 +1223,7 @@ class LayoutLMForQuestionAnswering(LayoutLMPreTrainedModel):
>>> tokenizer = AutoTokenizer.from_pretrained("impira/layoutlm-document-qa", add_prefix_space=True)
>>> model = LayoutLMForQuestionAnswering.from_pretrained("impira/layoutlm-document-qa", revision="1e3ebac")

>>> dataset = load_dataset("nielsr/funsd", split="train")
>>> dataset = load_dataset("nielsr/funsd", split="train", trust_remote_code=True)
>>> example = dataset[0]
>>> question = "what's his name?"
>>> words = example["words"]

@@ -1601,7 +1601,7 @@ class TFLayoutLMForQuestionAnswering(TFLayoutLMPreTrainedModel, TFQuestionAnswer
>>> tokenizer = AutoTokenizer.from_pretrained("impira/layoutlm-document-qa", add_prefix_space=True)
>>> model = TFLayoutLMForQuestionAnswering.from_pretrained("impira/layoutlm-document-qa", revision="1e3ebac")

>>> dataset = load_dataset("nielsr/funsd", split="train")
>>> dataset = load_dataset("nielsr/funsd", split="train", trust_remote_code=True)
>>> example = dataset[0]
>>> question = "what's his name?"
>>> words = example["words"]
@@ -763,8 +763,9 @@ class LayoutLMv2Model(LayoutLMv2PreTrainedModel):
>>> model = LayoutLMv2Model.from_pretrained("microsoft/layoutlmv2-base-uncased")

>>> dataset = load_dataset("hf-internal-testing/fixtures_docvqa")
>>> image = dataset["test"][0]["image"]
>>> dataset = load_dataset("hf-internal-testing/fixtures_docvqa", trust_remote_code=True)
>>> image_path = dataset["test"][0]["file"]
>>> image = Image.open(image_path).convert("RGB")

>>> encoding = processor(image, return_tensors="pt")

@@ -952,7 +953,7 @@ class LayoutLMv2ForSequenceClassification(LayoutLMv2PreTrainedModel):

>>> set_seed(0)

>>> dataset = load_dataset("aharley/rvl_cdip", split="train", streaming=True)
>>> dataset = load_dataset("aharley/rvl_cdip", split="train", streaming=True, trust_remote_code=True)
>>> data = next(iter(dataset))
>>> image = data["image"].convert("RGB")

@@ -1154,7 +1155,7 @@ class LayoutLMv2ForTokenClassification(LayoutLMv2PreTrainedModel):

>>> set_seed(0)

>>> datasets = load_dataset("nielsr/funsd", split="test")
>>> datasets = load_dataset("nielsr/funsd", split="test", trust_remote_code=True)
>>> labels = datasets.features["ner_tags"].feature.names
>>> id2label = {v: k for v, k in enumerate(labels)}

@@ -1311,8 +1312,9 @@ class LayoutLMv2ForQuestionAnswering(LayoutLMv2PreTrainedModel):
>>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv2-base-uncased")
>>> model = LayoutLMv2ForQuestionAnswering.from_pretrained("microsoft/layoutlmv2-base-uncased")

>>> dataset = load_dataset("hf-internal-testing/fixtures_docvqa")
>>> image = dataset["test"][0]["image"]
>>> dataset = load_dataset("hf-internal-testing/fixtures_docvqa", trust_remote_code=True)
>>> image_path = dataset["test"][0]["file"]
>>> image = Image.open(image_path).convert("RGB")
>>> question = "When is coffee break?"
>>> encoding = processor(image, question, return_tensors="pt")
@@ -746,7 +746,7 @@ class LayoutLMv3Model(LayoutLMv3PreTrainedModel):
>>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
>>> model = AutoModel.from_pretrained("microsoft/layoutlmv3-base")

>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
>>> example = dataset[0]
>>> image = example["image"]
>>> words = example["tokens"]

@@ -961,7 +961,7 @@ class LayoutLMv3ForTokenClassification(LayoutLMv3PreTrainedModel):
>>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
>>> model = AutoModelForTokenClassification.from_pretrained("microsoft/layoutlmv3-base", num_labels=7)

>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
>>> example = dataset[0]
>>> image = example["image"]
>>> words = example["tokens"]

@@ -1062,7 +1062,7 @@ class LayoutLMv3ForQuestionAnswering(LayoutLMv3PreTrainedModel):
>>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
>>> model = AutoModelForQuestionAnswering.from_pretrained("microsoft/layoutlmv3-base")

>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
>>> example = dataset[0]
>>> image = example["image"]
>>> question = "what's his name?"

@@ -1182,7 +1182,7 @@ class LayoutLMv3ForSequenceClassification(LayoutLMv3PreTrainedModel):
>>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
>>> model = AutoModelForSequenceClassification.from_pretrained("microsoft/layoutlmv3-base")

>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
>>> example = dataset[0]
>>> image = example["image"]
>>> words = example["tokens"]

@@ -1296,7 +1296,7 @@ class TFLayoutLMv3Model(TFLayoutLMv3PreTrainedModel):
>>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
>>> model = TFAutoModel.from_pretrained("microsoft/layoutlmv3-base")

>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
>>> example = dataset[0]
>>> image = example["image"]
>>> words = example["tokens"]

@@ -1439,7 +1439,7 @@ class TFLayoutLMv3ForSequenceClassification(TFLayoutLMv3PreTrainedModel, TFSeque
>>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
>>> model = TFAutoModelForSequenceClassification.from_pretrained("microsoft/layoutlmv3-base")

>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
>>> example = dataset[0]
>>> image = example["image"]
>>> words = example["tokens"]

@@ -1566,7 +1566,7 @@ class TFLayoutLMv3ForTokenClassification(TFLayoutLMv3PreTrainedModel, TFTokenCla
>>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
>>> model = TFAutoModelForTokenClassification.from_pretrained("microsoft/layoutlmv3-base", num_labels=7)

>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
>>> example = dataset[0]
>>> image = example["image"]
>>> words = example["tokens"]

@@ -1703,7 +1703,7 @@ class TFLayoutLMv3ForQuestionAnswering(TFLayoutLMv3PreTrainedModel, TFQuestionAn
>>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
>>> model = TFAutoModelForQuestionAnswering.from_pretrained("microsoft/layoutlmv3-base")

>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
>>> example = dataset[0]
>>> image = example["image"]
>>> question = "what's his name?"
@@ -653,7 +653,7 @@ class LiltModel(LiltPreTrainedModel):
>>> tokenizer = AutoTokenizer.from_pretrained("SCUT-DLVCLab/lilt-roberta-en-base")
>>> model = AutoModel.from_pretrained("SCUT-DLVCLab/lilt-roberta-en-base")

>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
>>> example = dataset[0]
>>> words = example["tokens"]
>>> boxes = example["bboxes"]

@@ -793,7 +793,7 @@ class LiltForSequenceClassification(LiltPreTrainedModel):
>>> tokenizer = AutoTokenizer.from_pretrained("SCUT-DLVCLab/lilt-roberta-en-base")
>>> model = AutoModelForSequenceClassification.from_pretrained("SCUT-DLVCLab/lilt-roberta-en-base")

>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
>>> example = dataset[0]
>>> words = example["tokens"]
>>> boxes = example["bboxes"]

@@ -908,7 +908,7 @@ class LiltForTokenClassification(LiltPreTrainedModel):
>>> tokenizer = AutoTokenizer.from_pretrained("SCUT-DLVCLab/lilt-roberta-en-base")
>>> model = AutoModelForTokenClassification.from_pretrained("SCUT-DLVCLab/lilt-roberta-en-base")

>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
>>> example = dataset[0]
>>> words = example["tokens"]
>>> boxes = example["bboxes"]

@@ -1025,7 +1025,7 @@ class LiltForQuestionAnswering(LiltPreTrainedModel):
>>> tokenizer = AutoTokenizer.from_pretrained("SCUT-DLVCLab/lilt-roberta-en-base")
>>> model = AutoModelForQuestionAnswering.from_pretrained("SCUT-DLVCLab/lilt-roberta-en-base")

>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
>>> example = dataset[0]
>>> words = example["tokens"]
>>> boxes = example["bboxes"]
@@ -40,6 +40,7 @@ from ...modeling_outputs import (
from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...pytorch_utils import ALL_LAYERNORM_LAYERS
from ...utils import LossKwargs, auto_docstring, can_return_tuple, logging
from .configuration_llama import LlamaConfig

@@ -68,6 +69,9 @@ class LlamaRMSNorm(nn.Module):
return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"

ALL_LAYERNORM_LAYERS.append(LlamaRMSNorm)

class LlamaRotaryEmbedding(nn.Module):
def __init__(self, config: LlamaConfig, device=None):
super().__init__()
@@ -34,7 +34,7 @@ from ...modeling_outputs import (
Seq2SeqModelOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...pytorch_utils import ALL_LAYERNORM_LAYERS, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
DUMMY_INPUTS,
DUMMY_MASK,

@@ -258,6 +258,8 @@ except Exception:
logger.warning("discovered apex but it failed to load, falling back to LongT5LayerNorm")
pass

ALL_LAYERNORM_LAYERS.append(LongT5LayerNorm)

# Copied from transformers.models.t5.modeling_t5.T5DenseActDense with T5->LongT5
class LongT5DenseActDense(nn.Module):
@@ -154,7 +154,7 @@ class ModernBertUnpaddedRotaryEmbedding(RotaryEmbedding):
up to max_seqlen. If the max_seqlen, device, or dtype during training/inference differ,
the cos_sin_cache will be recomputed during the forward pass.
"""
super().__init__(dim=dim, base=base, device=device, interleaved=False)
super().__init__(dim=dim, base=base, pos_idx_in_fp32=True, device=device, interleaved=False)
self.max_seqlen = max_seqlen

if max_seqlen is not None and device is not None and dtype is not None:

@@ -417,7 +417,7 @@ class ModernBertUnpaddedRotaryEmbedding(RotaryEmbedding):
up to max_seqlen. If the max_seqlen, device, or dtype during training/inference differ,
the cos_sin_cache will be recomputed during the forward pass.
"""
super().__init__(dim=dim, base=base, device=device, interleaved=False)
super().__init__(dim=dim, base=base, pos_idx_in_fp32=True, device=device, interleaved=False)
self.max_seqlen = max_seqlen

if max_seqlen is not None and device is not None and dtype is not None:
@@ -31,6 +31,7 @@ from ...modeling_flash_attention_utils import flash_attn_supports_top_left_mask,
from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, ModelOutput, Seq2SeqLMOutput
from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import ALL_LAYERNORM_LAYERS
from ...utils import auto_docstring, is_torch_flex_attn_available, is_torchdynamo_compiling, logging
from ..auto.modeling_auto import AutoModel
from .configuration_moshi import MoshiConfig, MoshiDepthConfig

@@ -233,6 +234,9 @@ class MoshiRMSNorm(nn.Module):
return f"{tuple(self.weight.shape)}, eps={self.eps}"

ALL_LAYERNORM_LAYERS.append(MoshiRMSNorm)

class MoshiFlexibleLinear(nn.Module):
def __init__(self, input_size, output_size, num_layers):
super().__init__()
@@ -37,6 +37,7 @@ from ...modeling_outputs import (
)
from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import ALL_LAYERNORM_LAYERS
from ...utils import auto_docstring, can_return_tuple, is_torch_flex_attn_available, logging
from .configuration_nemotron import NemotronConfig

@@ -84,6 +85,9 @@ class NemotronLayerNorm1P(nn.LayerNorm):
return F.layer_norm(*args)

ALL_LAYERNORM_LAYERS.append(NemotronLayerNorm1P)

# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with LLAMA->NEMOTRON,Llama->Nemotron,llama->nemotron
class NemotronRotaryEmbedding(nn.Module):
# Ignore copy
@@ -5,6 +5,7 @@ import torch.nn as nn

from ...cache_utils import Cache
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
from ...pytorch_utils import ALL_LAYERNORM_LAYERS
from ...utils import logging
from ..llama.modeling_llama import LlamaPreTrainedModel, LlamaRMSNorm, eager_attention_forward
from ..olmo.configuration_olmo import OlmoConfig

@@ -175,6 +176,9 @@ class Olmo2RMSNorm(LlamaRMSNorm):
return (self.weight * hidden_states).to(input_dtype)

ALL_LAYERNORM_LAYERS.append(Olmo2RMSNorm)

def rotate_half(x):
"""Rotates half the hidden dims of the input."""
x1 = x[..., : x.shape[-1] // 2]
@@ -27,6 +27,7 @@ from ...modeling_flash_attention_utils import flash_attn_supports_top_left_mask,
from ...modeling_outputs import MoeCausalLMOutputWithPast, MoeModelOutputWithPast
from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import ALL_LAYERNORM_LAYERS
from ...utils import auto_docstring, logging
from .configuration_olmoe import OlmoeConfig

@@ -141,6 +142,9 @@ class OlmoeRMSNorm(nn.Module):
return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"

ALL_LAYERNORM_LAYERS.append(OlmoeRMSNorm)

# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Olmoe
class OlmoeRotaryEmbedding(nn.Module):
def __init__(self, config: OlmoeConfig, device=None):
@@ -33,6 +33,7 @@ from ...modeling_outputs import (
Seq2SeqModelOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import ALL_LAYERNORM_LAYERS
from ...utils import (
DUMMY_INPUTS,
DUMMY_MASK,

@@ -95,6 +96,8 @@ except Exception:
logger.warning("Discovered apex but it failed to load, falling back to Pix2StructLayerNorm")
pass

ALL_LAYERNORM_LAYERS.append(Pix2StructLayerNorm)

class Pix2StructVisionEmbeddings(nn.Module):
r"""
@@ -30,7 +30,7 @@ from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPastAndCrossAttentions, Seq2SeqLMOutput
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...pytorch_utils import ALL_LAYERNORM_LAYERS, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import auto_docstring, is_torch_flex_attn_available, is_torch_fx_proxy, is_torchdynamo_compiling, logging
from .configuration_pop2piano import Pop2PianoConfig

@@ -88,6 +88,8 @@ class Pop2PianoLayerNorm(nn.Module):
if not _load_pop2piano_layer_norm:
Pop2PianoLayerNorm = FusedRMSNorm  # noqa

ALL_LAYERNORM_LAYERS.append(Pop2PianoLayerNorm)

# Copied from transformers.models.t5.modeling_t5.T5DenseActDense with T5->Pop2Piano,t5->pop2piano
class Pop2PianoDenseActDense(nn.Module):
@@ -27,6 +27,7 @@ from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...modeling_outputs import BaseModelOutputWithNoAttention, CausalLMOutput
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import ALL_LAYERNORM_LAYERS
from ...utils import auto_docstring, logging
from ...utils.import_utils import is_torchdynamo_compiling
from .configuration_recurrent_gemma import RecurrentGemmaConfig

@@ -57,6 +58,9 @@ class RecurrentGemmaRMSNorm(nn.Module):
return f"{tuple(self.weight.shape)}, eps={self.eps}"

ALL_LAYERNORM_LAYERS.append(RecurrentGemmaRMSNorm)

class RecurrentGemmaRotaryEmbedding(nn.Module):
def __init__(self, dim, base=10000, device=None):
super().__init__()
@@ -2228,7 +2228,7 @@ class SpeechT5ForSpeechToText(SpeechT5PreTrainedModel, GenerationMixin):
>>> from datasets import load_dataset

>>> dataset = load_dataset(
...     "hf-internal-testing/librispeech_asr_demo", "clean", split="validation"
...     "hf-internal-testing/librispeech_asr_demo", "clean", split="validation", trust_remote_code=True
... )  # doctest: +IGNORE_RESULT
>>> dataset = dataset.sort("id")
>>> sampling_rate = dataset.features["audio"].sampling_rate

@@ -2909,7 +2909,7 @@ class SpeechT5ForSpeechToSpeech(SpeechT5PreTrainedModel):
>>> import torch

>>> dataset = load_dataset(
...     "hf-internal-testing/librispeech_asr_demo", "clean", split="validation"
...     "hf-internal-testing/librispeech_asr_demo", "clean", split="validation", trust_remote_code=True
... )  # doctest: +IGNORE_RESULT
>>> dataset = dataset.sort("id")
>>> sampling_rate = dataset.features["audio"].sampling_rate
@@ -34,7 +34,7 @@ from ...modeling_outputs import (
Seq2SeqMoEOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...pytorch_utils import ALL_LAYERNORM_LAYERS, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
DUMMY_INPUTS,
DUMMY_MASK,

@@ -240,6 +240,9 @@ class SwitchTransformersLayerNorm(nn.Module):
return self.weight * hidden_states

ALL_LAYERNORM_LAYERS.append(SwitchTransformersLayerNorm)

# Copied from transformers.models.t5.modeling_t5.T5DenseActDense with T5->SwitchTransformers
class SwitchTransformersDenseActDense(nn.Module):
def __init__(self, config: SwitchTransformersConfig):
@@ -38,7 +38,7 @@ from ...modeling_outputs import (
TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...pytorch_utils import ALL_LAYERNORM_LAYERS, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
DUMMY_INPUTS,
DUMMY_MASK,

@@ -273,6 +273,8 @@ except Exception:
logger.warning("discovered apex but it failed to load, falling back to T5LayerNorm")
pass

ALL_LAYERNORM_LAYERS.append(T5LayerNorm)

class T5DenseActDense(nn.Module):
def __init__(self, config: T5Config):
@@ -1604,7 +1604,7 @@ class UdopModel(UdopPreTrainedModel):

>>> # load an example image, along with the words and coordinates
>>> # which were extracted using an OCR engine
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
>>> example = dataset[0]
>>> image = example["image"]
>>> words = example["tokens"]

@@ -1813,7 +1813,7 @@ class UdopForConditionalGeneration(UdopPreTrainedModel, GenerationMixin):

>>> # load an example image, along with the words and coordinates
>>> # which were extracted using an OCR engine
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
>>> example = dataset[0]
>>> image = example["image"]
>>> words = example["tokens"]

@@ -2025,7 +2025,7 @@ class UdopEncoderModel(UdopPreTrainedModel):

>>> # load an example image, along with the words and coordinates
>>> # which were extracted using an OCR engine
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
>>> example = dataset[0]
>>> image = example["image"]
>>> words = example["tokens"]
@@ -590,7 +590,7 @@ class Wav2Vec2CTCTokenizer(PreTrainedTokenizer):
>>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")

>>> # load first sample of English common_voice
>>> dataset = load_dataset("mozilla-foundation/common_voice_11_0", "en", split="train", streaming=True)
>>> dataset = load_dataset("mozilla-foundation/common_voice_11_0", "en", split="train", streaming=True, trust_remote_code=True)
>>> dataset = dataset.cast_column("audio", datasets.Audio(sampling_rate=16_000))
>>> dataset_iter = iter(dataset)
>>> sample = next(dataset_iter)

@@ -546,7 +546,7 @@ class Wav2Vec2ProcessorWithLM(ProcessorMixin):
>>> processor = AutoProcessor.from_pretrained("patrickvonplaten/wav2vec2-base-100h-with-lm")

>>> # load first sample of English common_voice
>>> dataset = load_dataset("mozilla-foundation/common_voice_11_0", "en", split="train", streaming=True)
>>> dataset = load_dataset("mozilla-foundation/common_voice_11_0", "en", split="train", streaming=True, trust_remote_code=True)
>>> dataset = dataset.cast_column("audio", datasets.Audio(sampling_rate=16_000))
>>> dataset_iter = iter(dataset)
>>> sample = next(dataset_iter)
@@ -1670,7 +1670,7 @@ FLAX_WHISPER_AUDIO_CLASSIFICATION_DOCSTRING = r"""
>>> model = FlaxWhisperForAudioClassification.from_pretrained(
...     "sanchit-gandhi/whisper-medium-fleurs-lang-id", from_pt=True
... )
>>> ds = load_dataset("google/fleurs", "all", split="validation", streaming=True)
>>> ds = load_dataset("google/fleurs", "all", split="validation", streaming=True, trust_remote_code=True)

>>> sample = next(iter(ds))
@@ -35,6 +35,7 @@ from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...pytorch_utils import ALL_LAYERNORM_LAYERS
from ...utils import auto_docstring, logging
from ...utils.import_utils import is_causal_conv1d_available, is_mamba_ssm_available
from .configuration_zamba import ZambaConfig

@@ -80,6 +81,9 @@ class ZambaRMSNorm(nn.Module):
return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"

ALL_LAYERNORM_LAYERS.append(ZambaRMSNorm)

# Copied from transformers.models.llama.modeling_llama.repeat_kv
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
"""
@@ -73,6 +73,7 @@ from .models.auto.modeling_auto import (
from .optimization import Adafactor, get_scheduler
from .processing_utils import ProcessorMixin
from .pytorch_utils import (
ALL_LAYERNORM_LAYERS,
is_torch_greater_or_equal_than_2_3,
)
from .tokenization_utils_base import PreTrainedTokenizerBase

@@ -1185,10 +1186,9 @@ class Trainer:

This function filters out parameters in two ways:
1. By layer type (instances of layers specified in ALL_LAYERNORM_LAYERS)
2. By parameter name patterns (containing 'bias', or variation of 'norm')
2. By parameter name patterns (containing 'bias', 'layernorm', or 'rmsnorm')
"""
forbidden_name_patterns = [r"bias", r"layernorm", r"rmsnorm", r"(?:^|\.)norm(?:$|\.)", r"_norm(?:$|\.)"]
decay_parameters = get_parameter_names(model, [nn.LayerNorm], forbidden_name_patterns)
decay_parameters = get_parameter_names(model, ALL_LAYERNORM_LAYERS, ["bias", "layernorm", "rmsnorm"])
return decay_parameters

def create_optimizer(self):
@@ -21,7 +21,6 @@ import io
import json
import math
import os
import re
import sys
import warnings
from collections.abc import Iterator, Mapping

@@ -1125,9 +1124,8 @@ def get_parameter_names(model, forbidden_layer_types, forbidden_layer_names=None
"""
Returns the names of the model parameters that are not inside a forbidden layer.
"""
forbidden_layer_patterns = (
[re.compile(pattern) for pattern in forbidden_layer_names] if forbidden_layer_names is not None else []
)
if forbidden_layer_names is None:
forbidden_layer_names = []
result = []
for name, child in model.named_children():
child_params = get_parameter_names(child, forbidden_layer_types, forbidden_layer_names)

@@ -1135,15 +1133,12 @@ def get_parameter_names(model, forbidden_layer_types, forbidden_layer_names=None
f"{name}.{n}"
for n in child_params
if not isinstance(child, tuple(forbidden_layer_types))
and not any(pattern.search(f"{name}.{n}".lower()) for pattern in forbidden_layer_patterns)
and not any(forbidden in f"{name}.{n}".lower() for forbidden in forbidden_layer_names)
]
# Add model specific parameters that are not in any child
result += [
k
for k in model._parameters.keys()
if not any(pattern.search(k.lower()) for pattern in forbidden_layer_patterns)
k for k in model._parameters.keys() if not any(forbidden in k.lower() for forbidden in forbidden_layer_names)
]

return result
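The function above implements the name filter described in the Trainer docstring earlier in the diff (layer type plus parameter-name substrings). A short usage sketch, assuming the substring-matching variant of `get_parameter_names` shown in this hunk and a toy two-layer model:

```python
import torch.nn as nn
from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS
from transformers.trainer_pt_utils import get_parameter_names

model = nn.Sequential(nn.Linear(4, 4), nn.LayerNorm(4))
decay_parameters = get_parameter_names(model, ALL_LAYERNORM_LAYERS, ["bias", "layernorm", "rmsnorm"])
print(decay_parameters)  # ['0.weight']: norm weights and all biases receive no weight decay
```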
@@ -423,7 +423,7 @@ PT_SPEECH_BASE_MODEL_SAMPLE = r"""
>>> import torch
>>> from datasets import load_dataset

>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation", trust_remote_code=True)
>>> dataset = dataset.sort("id")
>>> sampling_rate = dataset.features["audio"].sampling_rate

@@ -449,7 +449,7 @@ PT_SPEECH_CTC_SAMPLE = r"""
>>> from datasets import load_dataset
>>> import torch

>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation", trust_remote_code=True)
>>> dataset = dataset.sort("id")
>>> sampling_rate = dataset.features["audio"].sampling_rate

@@ -484,7 +484,7 @@ PT_SPEECH_SEQ_CLASS_SAMPLE = r"""
>>> from datasets import load_dataset
>>> import torch

>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation", trust_remote_code=True)
>>> dataset = dataset.sort("id")
>>> sampling_rate = dataset.features["audio"].sampling_rate

@@ -520,7 +520,7 @@ PT_SPEECH_FRAME_CLASS_SAMPLE = r"""
>>> from datasets import load_dataset
>>> import torch

>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation", trust_remote_code=True)
>>> dataset = dataset.sort("id")
>>> sampling_rate = dataset.features["audio"].sampling_rate

@@ -549,7 +549,7 @@ PT_SPEECH_XVECTOR_SAMPLE = r"""
>>> from datasets import load_dataset
>>> import torch

>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation", trust_remote_code=True)
>>> dataset = dataset.sort("id")
>>> sampling_rate = dataset.features["audio"].sampling_rate

@@ -584,7 +584,7 @@ PT_VISION_BASE_MODEL_SAMPLE = r"""
>>> import torch
>>> from datasets import load_dataset

>>> dataset = load_dataset("huggingface/cats-image")
>>> dataset = load_dataset("huggingface/cats-image", trust_remote_code=True)
>>> image = dataset["test"]["image"][0]

>>> image_processor = AutoImageProcessor.from_pretrained("{checkpoint}")

@@ -609,7 +609,7 @@ PT_VISION_SEQ_CLASS_SAMPLE = r"""
>>> import torch
>>> from datasets import load_dataset

>>> dataset = load_dataset("huggingface/cats-image")
>>> dataset = load_dataset("huggingface/cats-image", trust_remote_code=True)
>>> image = dataset["test"]["image"][0]

>>> image_processor = AutoImageProcessor.from_pretrained("{checkpoint}")

@@ -1194,7 +1194,7 @@ TF_SPEECH_BASE_MODEL_SAMPLE = r"""
>>> from transformers import AutoProcessor, {model_class}
>>> from datasets import load_dataset

>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation", trust_remote_code=True)
>>> dataset = dataset.sort("id")
>>> sampling_rate = dataset.features["audio"].sampling_rate

@@ -1219,7 +1219,7 @@ TF_SPEECH_CTC_SAMPLE = r"""
>>> from datasets import load_dataset
>>> import tensorflow as tf

>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation", trust_remote_code=True)
>>> dataset = dataset.sort("id")
>>> sampling_rate = dataset.features["audio"].sampling_rate

@@ -1254,7 +1254,7 @@ TF_VISION_BASE_MODEL_SAMPLE = r"""
>>> from transformers import AutoImageProcessor, {model_class}
>>> from datasets import load_dataset

>>> dataset = load_dataset("huggingface/cats-image")
>>> dataset = load_dataset("huggingface/cats-image", trust_remote_code=True)
>>> image = dataset["test"]["image"][0]

>>> image_processor = AutoImageProcessor.from_pretrained("{checkpoint}")

@@ -1277,7 +1277,7 @@ TF_VISION_SEQ_CLASS_SAMPLE = r"""
>>> import tensorflow as tf
>>> from datasets import load_dataset

>>> dataset = load_dataset("huggingface/cats-image"))
>>> dataset = load_dataset("huggingface/cats-image", trust_remote_code=True)
>>> image = dataset["test"]["image"][0]

>>> image_processor = AutoImageProcessor.from_pretrained("{checkpoint}")
@@ -269,6 +269,7 @@ def make_task_cmds():
"img_clas": f"""
{scripts_dir}/image-classification/run_image_classification.py
--dataset_name hf-internal-testing/cats_vs_dogs_sample
--trust_remote_code
--remove_unused_columns False
--max_steps 10
--image_processor_name {DS_TESTS_DIRECTORY}/vit_feature_extractor.json
@@ -27,6 +27,8 @@ if is_torch_available():
import torch

if is_vision_available():
from PIL import Image

from transformers import BeitImageProcessor

if is_torchvision_available():

@@ -96,14 +98,23 @@ class BeitImageProcessingTester:

def prepare_semantic_single_inputs():
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
example = ds[0]
return example["image"], example["map"]
dataset = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)

image = Image.open(dataset[0]["file"])
map = Image.open(dataset[1]["file"])

return image, map

def prepare_semantic_batch_inputs():
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
return list(ds["image"][:2]), list(ds["map"][:2])
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)

image1 = Image.open(ds[0]["file"])
map1 = Image.open(ds[1]["file"])
image2 = Image.open(ds[2]["file"])
map2 = Image.open(ds[3]["file"])

return [image1, image2], [map1, map2]

@require_torch
@ -16,6 +16,7 @@
|
||||
import unittest
|
||||
|
||||
from datasets import load_dataset
|
||||
from packaging import version
|
||||
|
||||
from transformers import BeitConfig
|
||||
from transformers.testing_utils import (
|
||||
@ -52,6 +53,7 @@ if is_torch_available():
|
||||
|
||||
|
||||
if is_vision_available():
|
||||
import PIL
|
||||
from PIL import Image
|
||||
|
||||
from transformers import BeitImageProcessor
|
||||
@ -502,8 +504,8 @@ class BeitModelIntegrationTest(unittest.TestCase):
|
||||
|
||||
image_processor = BeitImageProcessor(do_resize=True, size=640, do_center_crop=False)
|
||||
|
||||
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
|
||||
image = ds[0]["image"].convert("RGB")
|
||||
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
|
||||
image = Image.open(ds[0]["file"])
|
||||
inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
|
||||
|
||||
# forward pass
|
||||
@ -515,14 +517,27 @@ class BeitModelIntegrationTest(unittest.TestCase):
|
||||
expected_shape = torch.Size((1, 150, 160, 160))
|
||||
self.assertEqual(logits.shape, expected_shape)
|
||||
|
||||
expected_slice = torch.tensor(
|
||||
[
|
||||
[[-4.8963, -2.3696, -3.0359], [-2.8485, -0.9842, -1.7426], [-2.9453, -1.3338, -2.1463]],
|
||||
[[-5.8099, -3.4140, -4.1025], [-3.8578, -2.2100, -3.0337], [-3.8383, -2.4615, -3.3681]],
|
||||
[[-0.0314, 3.9864, 4.0536], [2.9637, 4.6879, 4.9976], [3.2074, 4.7690, 4.9946]],
|
||||
],
|
||||
device=torch_device,
|
||||
)
|
||||
is_pillow_less_than_9 = version.parse(PIL.__version__) < version.parse("9.0.0")
|
||||
|
||||
if is_pillow_less_than_9:
|
||||
expected_slice = torch.tensor(
|
||||
[
|
||||
[[-4.9225, -2.3954, -3.0522], [-2.8822, -1.0046, -1.7561], [-2.9549, -1.3228, -2.1347]],
|
||||
[[-5.8168, -3.4129, -4.0778], [-3.8651, -2.2214, -3.0277], [-3.8356, -2.4643, -3.3535]],
|
||||
[[-0.0078, 3.9952, 4.0754], [2.9856, 4.6944, 5.0035], [3.2413, 4.7813, 4.9969]],
|
||||
],
|
||||
device=torch_device,
|
||||
)
|
||||
else:
|
||||
expected_slice = torch.tensor(
|
||||
[
|
||||
[[-4.8960, -2.3688, -3.0355], [-2.8478, -0.9836, -1.7418], [-2.9449, -1.3332, -2.1456]],
|
||||
[[-5.8081, -3.4124, -4.1006], [-3.8561, -2.2081, -3.0323], [-3.8365, -2.4601, -3.3669]],
|
||||
[[-0.0309, 3.9868, 4.0540], [2.9640, 4.6877, 4.9976], [3.2081, 4.7690, 4.9942]],
|
||||
],
|
||||
device=torch_device,
|
||||
)
|
||||
|
||||
torch.testing.assert_close(logits[0, :3, :3, :3], expected_slice, rtol=1e-4, atol=1e-4)
|
||||
|
||||
@slow
|
||||
@ -532,8 +547,8 @@ class BeitModelIntegrationTest(unittest.TestCase):
|
||||
|
||||
image_processor = BeitImageProcessor(do_resize=True, size=640, do_center_crop=False)
|
||||
|
||||
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
|
||||
image = ds[0]["image"].convert("RGB")
|
||||
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
|
||||
image = Image.open(ds[0]["file"])
|
||||
inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
|
||||
|
||||
# forward pass
|
||||
|
||||
@ -669,7 +669,7 @@ class Data2VecAudioModelIntegrationTest(unittest.TestCase):
|
||||
return [x["array"] for x in speech_samples]
|
||||
|
||||
def _load_superb(self, task, num_samples):
|
||||
ds = load_dataset("anton-l/superb_dummy", task, split="test")
|
||||
ds = load_dataset("anton-l/superb_dummy", task, split="test", trust_remote_code=True)
|
||||
|
||||
return ds[:num_samples]
|
||||
|
||||
|
||||
@ -29,6 +29,8 @@ if is_torch_available():
|
||||
import torch
|
||||
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
|
||||
from transformers import DPTImageProcessor
|
||||
|
||||
if is_torchvision_available():
|
||||
@ -92,15 +94,24 @@ class DPTImageProcessingTester:
|
||||
|
||||
# Copied from transformers.tests.models.beit.test_image_processing_beit.prepare_semantic_single_inputs
|
||||
def prepare_semantic_single_inputs():
|
||||
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
|
||||
example = ds[0]
|
||||
return example["image"], example["map"]
|
||||
dataset = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
|
||||
|
||||
image = Image.open(dataset[0]["file"])
|
||||
map = Image.open(dataset[1]["file"])
|
||||
|
||||
return image, map
|
||||
|
||||
|
||||
# Copied from transformers.tests.models.beit.test_image_processing_beit.prepare_semantic_batch_inputs
|
||||
def prepare_semantic_batch_inputs():
|
||||
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
|
||||
return list(ds["image"][:2]), list(ds["map"][:2])
|
||||
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
|
||||
|
||||
image1 = Image.open(ds[0]["file"])
|
||||
map1 = Image.open(ds[1]["file"])
|
||||
image2 = Image.open(ds[2]["file"])
|
||||
map2 = Image.open(ds[3]["file"])
|
||||
|
||||
return [image1, image2], [map1, map2]
|
||||
|
||||
|
||||
@require_torch
|
||||
|
||||
@ -391,7 +391,7 @@ class GraniteSpeechForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
|
||||
EXPECTED_DECODED_TEXT = [
|
||||
"systemKnowledge Cutoff Date: April 2024.\nToday's Date: December 19, 2024.\nYou are Granite, developed by IBM. You are a helpful AI assistant\nusercan you transcribe the speech into a written format?\nassistantmister quilter is the apostle of the middle classes and we are glad to welcome his gospel",
|
||||
"systemKnowledge Cutoff Date: April 2024.\nToday's Date: December 19, 2024.\nYou are Granite, developed by IBM. You are a helpful AI assistant\nusercan you transcribe the speech into a written format?\nassistantnor is mister quilter's manner less interesting than his matter"
|
||||
"systemKnowledge Cutoff Date: April 2024.\nToday's Date: December 19, 2024.\nYou are Granite, developed by IBM. You are a helpful AI assistant\nusercan you transcribe the speech into a written format?\nassistantnor is mister quilp's manner less interesting than his matter"
|
||||
] # fmt: skip
|
||||
|
||||
self.assertEqual(
|
||||
|
||||
@ -767,7 +767,7 @@ class HubertModelIntegrationTest(unittest.TestCase):
|
||||
def _load_superb(self, task, num_samples):
|
||||
from datasets import load_dataset
|
||||
|
||||
ds = load_dataset("anton-l/superb_dummy", task, split="test")
|
||||
ds = load_dataset("anton-l/superb_dummy", task, split="test", trust_remote_code=True)
|
||||
|
||||
return ds[:num_samples]
|
||||
|
||||
|
||||
@ -111,13 +111,13 @@ class LayoutLMv2ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase)
def test_layoutlmv2_integration_test(self):
from datasets import load_dataset

ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test")
ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test", trust_remote_code=True)

for image_processing_class in self.image_processor_list:
# with apply_OCR = True
image_processing = image_processing_class()

image = ds[0]["image"]
image = Image.open(ds[0]["file"]).convert("RGB")

encoding = image_processing(image, return_tensors="pt")


@ -28,6 +28,8 @@ from ...test_processing_common import ProcessorTesterMixin


if is_pytesseract_available():
from PIL import Image

from transformers import LayoutLMv2ImageProcessor


@ -154,11 +156,11 @@ class LayoutLMv2ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
from datasets import load_dataset

# set up
datasets = load_dataset("nielsr/funsd")
datasets = load_dataset("nielsr/funsd", trust_remote_code=True)
processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased", revision="no_ocr")

def preprocess_data(examples):
images = [image.convert("RGB") for image in examples["image"]]
images = [Image.open(path).convert("RGB") for path in examples["image_path"]]
words = examples["words"]
boxes = examples["bboxes"]
word_labels = examples["ner_tags"]
@ -190,8 +192,12 @@ class LayoutLMv2ProcessorIntegrationTests(unittest.TestCase):
# we verify our implementation on 2 document images from the DocVQA dataset
from datasets import load_dataset

ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test")
return ds[0]["image"].convert("RGB"), ds[1]["image"].convert("RGB")
ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test", trust_remote_code=True)

image_1 = Image.open(ds[0]["file"]).convert("RGB")
image_2 = Image.open(ds[1]["file"]).convert("RGB")

return image_1, image_2

@cached_property
def get_tokenizers(self):

@ -22,6 +22,8 @@ from ...test_image_processing_common import ImageProcessingTestMixin, prepare_im


if is_pytesseract_available():
from PIL import Image

from transformers import LayoutLMv3ImageProcessor

if is_torchvision_available():
@ -104,13 +106,13 @@ class LayoutLMv3ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase)
def test_LayoutLMv3_integration_test(self):
from datasets import load_dataset

ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test")
ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test", trust_remote_code=True)

# with apply_OCR = True
for image_processing_class in self.image_processor_list:
image_processor = image_processing_class()

image = ds[0]["image"].convert("RGB")
image = Image.open(ds[0]["file"]).convert("RGB")

encoding = image_processor(image, return_tensors="pt")


@ -28,6 +28,8 @@ from ...test_processing_common import ProcessorTesterMixin


if is_pytesseract_available():
from PIL import Image

from transformers import LayoutLMv3ImageProcessor


@ -170,8 +172,12 @@ class LayoutLMv3ProcessorIntegrationTests(unittest.TestCase):
# we verify our implementation on 2 document images from the DocVQA dataset
from datasets import load_dataset

ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test")
return ds[0]["image"].convert("RGB"), ds[1]["image"].convert("RGB")
ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test", trust_remote_code=True)

image_1 = Image.open(ds[0]["file"]).convert("RGB")
image_2 = Image.open(ds[1]["file"]).convert("RGB")

return image_1, image_2

@cached_property
def get_tokenizers(self):

@ -33,6 +33,8 @@ from ...test_processing_common import ProcessorTesterMixin


if is_pytesseract_available():
from PIL import Image

from transformers import LayoutLMv2ImageProcessor


@ -160,11 +162,11 @@ class LayoutXLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
from datasets import load_dataset

# set up
datasets = load_dataset("nielsr/funsd")
datasets = load_dataset("nielsr/funsd", trust_remote_code=True)
processor = LayoutXLMProcessor.from_pretrained("microsoft/layoutxlm-base", apply_ocr=False)

def preprocess_data(examples):
images = [image.convert("RGB") for image in examples["image"]]
images = [Image.open(path).convert("RGB") for path in examples["image_path"]]
words = examples["words"]
boxes = examples["bboxes"]
word_labels = examples["ner_tags"]
@ -198,8 +200,12 @@ class LayoutXLMProcessorIntegrationTests(unittest.TestCase):
# we verify our implementation on 2 document images from the DocVQA dataset
from datasets import load_dataset

ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test")
return ds[0]["image"].convert("RGB"), ds[1]["image"].convert("RGB")
ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test", trust_remote_code=True)

image_1 = Image.open(ds[0]["file"]).convert("RGB")
image_2 = Image.open(ds[1]["file"]).convert("RGB")

return image_1, image_2

@cached_property
def get_tokenizers(self):

@ -27,6 +27,8 @@ if is_torch_available():
import torch

if is_vision_available():
from PIL import Image

from transformers import MobileViTImageProcessor


@ -84,14 +86,23 @@ class MobileViTImageProcessingTester:


def prepare_semantic_single_inputs():
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
example = ds[0]
return example["image"], example["map"]
dataset = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)

image = Image.open(dataset[0]["file"])
map = Image.open(dataset[1]["file"])

return image, map


def prepare_semantic_batch_inputs():
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
return list(ds["image"][:2]), list(ds["map"][:2])
dataset = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)

image1 = Image.open(dataset[0]["file"])
map1 = Image.open(dataset[1]["file"])
image2 = Image.open(dataset[2]["file"])
map2 = Image.open(dataset[3]["file"])

return [image1, image2], [map1, map2]


@require_torch

@ -86,12 +86,8 @@ class NougatImageProcessingTester:
return self.num_channels, self.size["height"], self.size["width"]

def prepare_dummy_image(self):
revision = "ec57bf8c8b1653a209c13f6e9ee66b12df0fc2db"
filepath = hf_hub_download(
repo_id="hf-internal-testing/fixtures_docvqa",
filename="nougat_pdf.png",
repo_type="dataset",
revision=revision,
repo_id="hf-internal-testing/fixtures_docvqa", filename="nougat_pdf.png", repo_type="dataset"
)
image = Image.open(filepath).convert("RGB")
return image
@ -183,12 +179,8 @@ class NougatImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
self.assertEqual((3, 100, 200), aligned_image.shape)

def prepare_dummy_np_image(self):
revision = "ec57bf8c8b1653a209c13f6e9ee66b12df0fc2db"
filepath = hf_hub_download(
repo_id="hf-internal-testing/fixtures_docvqa",
filename="nougat_pdf.png",
repo_type="dataset",
revision=revision,
repo_id="hf-internal-testing/fixtures_docvqa", filename="nougat_pdf.png", repo_type="dataset"
)
image = Image.open(filepath).convert("RGB")
return np.array(image)

@ -842,8 +842,11 @@ def prepare_img():

# Helper functions for optical flow integration test
def prepare_optical_flow_images():
ds = load_dataset("hf-internal-testing/fixtures_sintel", split="test")
return list(ds["image"][:2])
dataset = load_dataset("hf-internal-testing/fixtures_sintel", split="test", trust_remote_code=True)
image1 = Image.open(dataset[0]["file"]).convert("RGB")
image2 = Image.open(dataset[0]["file"]).convert("RGB")

return image1, image2


def normalize(img):

@ -27,6 +27,8 @@ if is_torch_available():
import torch

if is_vision_available():
from PIL import Image

from transformers import SegformerImageProcessor


@ -84,14 +86,23 @@ class SegformerImageProcessingTester:


def prepare_semantic_single_inputs():
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
example = ds[0]
return example["image"], example["map"]
dataset = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)

image = Image.open(dataset[0]["file"])
map = Image.open(dataset[1]["file"])

return image, map


def prepare_semantic_batch_inputs():
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
return list(ds["image"][:2]), list(ds["map"][:2])
dataset = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)

image1 = Image.open(dataset[0]["file"])
map1 = Image.open(dataset[1]["file"])
image2 = Image.open(dataset[2]["file"])
map2 = Image.open(dataset[3]["file"])

return [image1, image2], [map1, map2]


@require_torch

@ -16,9 +16,9 @@ import copy
import inspect
import unittest

from datasets import load_dataset
from huggingface_hub import hf_hub_download

from transformers import UdopConfig, is_torch_available
from transformers import UdopConfig, is_torch_available, is_vision_available
from transformers.testing_utils import (
require_sentencepiece,
require_tokenizers,
@ -42,6 +42,10 @@ if is_torch_available():
from transformers import UdopEncoderModel, UdopForConditionalGeneration, UdopModel, UdopProcessor


if is_vision_available():
from PIL import Image


class UdopModelTester:
def __init__(
self,
@ -614,8 +618,12 @@ class UdopEncoderOnlyModelTest(ModelTesterMixin, unittest.TestCase):
class UdopModelIntegrationTests(unittest.TestCase):
@cached_property
def image(self):
ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test")
return ds[1]["image"]
filepath = hf_hub_download(
repo_id="hf-internal-testing/fixtures_docvqa", filename="document_2.png", repo_type="dataset"
)
image = Image.open(filepath).convert("RGB")

return image

@cached_property
def processor(self):

@ -41,6 +41,8 @@ if is_torch_available():


if is_pytesseract_available():
from PIL import Image

from transformers import LayoutLMv3ImageProcessor


@ -182,11 +184,11 @@ class UdopProcessorTest(ProcessorTesterMixin, unittest.TestCase):
from datasets import load_dataset

# set up
datasets = load_dataset("nielsr/funsd")
datasets = load_dataset("nielsr/funsd", trust_remote_code=True)
processor = UdopProcessor.from_pretrained("microsoft/udop-large", apply_ocr=False)

def preprocess_data(examples):
images = [image.convert("RGB") for image in examples["image"]]
images = [Image.open(path).convert("RGB") for path in examples["image_path"]]
words = examples["words"]
boxes = examples["bboxes"]
word_labels = examples["ner_tags"]
@ -220,8 +222,12 @@ class UdopProcessorIntegrationTests(unittest.TestCase):
# we verify our implementation on 2 document images from the DocVQA dataset
from datasets import load_dataset

ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test")
return ds[0]["image"].convert("RGB"), ds[1]["image"].convert("RGB")
ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test", trust_remote_code=True)

image_1 = Image.open(ds[0]["file"]).convert("RGB")
image_2 = Image.open(ds[1]["file"]).convert("RGB")

return image_1, image_2

@cached_property
def get_tokenizers(self):

@ -566,7 +566,7 @@ class UniSpeechModelIntegrationTest(unittest.TestCase):
return [x["array"] for x in speech_samples]

def _load_superb(self, task, num_samples):
ds = load_dataset("anton-l/superb_dummy", task, split="test")
ds = load_dataset("anton-l/superb_dummy", task, split="test", trust_remote_code=True)

return ds[:num_samples]


@ -820,7 +820,7 @@ class UniSpeechSatModelIntegrationTest(unittest.TestCase):
return [x["array"] for x in speech_samples]

def _load_superb(self, task, num_samples):
ds = load_dataset("anton-l/superb_dummy", task, split="test")
ds = load_dataset("anton-l/superb_dummy", task, split="test", trust_remote_code=True)

return ds[:num_samples]

@ -15,7 +15,7 @@

import unittest

from datasets import load_dataset
from huggingface_hub import hf_hub_download

from transformers import ConvNextConfig, UperNetConfig
from transformers.testing_utils import (
@ -41,6 +41,8 @@ if is_torch_available():


if is_vision_available():
from PIL import Image

from transformers import AutoImageProcessor


@ -275,8 +277,11 @@ class UperNetModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase)

# We will verify our results on an image of ADE20k
def prepare_img():
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
return ds[0]["image"].convert("RGB")
filepath = hf_hub_download(
repo_id="hf-internal-testing/fixtures_ade20k", repo_type="dataset", filename="ADE_val_00000001.jpg"
)
image = Image.open(filepath).convert("RGB")
return image


@require_torch
@ -297,7 +302,7 @@ class UperNetModelIntegrationTest(unittest.TestCase):
self.assertEqual(outputs.logits.shape, expected_shape)

expected_slice = torch.tensor(
[[-7.5969, -7.5969, -7.4313], [-7.5969, -7.5969, -7.4313], [-7.4808, -7.4808, -7.3080]]
[[-7.5958, -7.5958, -7.4302], [-7.5958, -7.5958, -7.4302], [-7.4797, -7.4797, -7.3068]]
).to(torch_device)
torch.testing.assert_close(outputs.logits[0, 0, :3, :3], expected_slice, rtol=1e-4, atol=1e-4)


@ -637,9 +637,9 @@ class ViltModelIntegrationTest(unittest.TestCase):

processor = self.default_processor

dataset = load_dataset("hf-internal-testing/fixtures_nlvr2", split="train")
image1 = dataset[0]["image"]
image2 = dataset[1]["image"]
dataset = load_dataset("hf-internal-testing/fixtures_nlvr2", split="test", trust_remote_code=True)
image1 = Image.open(dataset[0]["file"]).convert("RGB")
image2 = Image.open(dataset[1]["file"]).convert("RGB")

text = (
"The left image contains twice the number of dogs as the right image, and at least two dogs in total are"

@ -1149,8 +1149,8 @@ class TrOCRModelIntegrationTest(unittest.TestCase):
def test_inference_handwritten(self):
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten").to(torch_device)

dataset = load_dataset("hf-internal-testing/fixtures_ocr", split="train")
image = dataset[1]["image"].convert("RGB")
dataset = load_dataset("hf-internal-testing/fixtures_ocr", split="test", trust_remote_code=True)
image = Image.open(dataset[0]["file"]).convert("RGB")

processor = self.default_processor
pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(torch_device)
@ -1174,8 +1174,8 @@ class TrOCRModelIntegrationTest(unittest.TestCase):
def test_inference_printed(self):
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-printed").to(torch_device)

dataset = load_dataset("hf-internal-testing/fixtures_ocr", split="train")
image = dataset[0]["image"].convert("RGB")
dataset = load_dataset("hf-internal-testing/fixtures_ocr", split="test", trust_remote_code=True)
image = Image.open(dataset[1]["file"]).convert("RGB")

processor = self.default_processor
pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(torch_device)

@ -97,7 +97,9 @@ def _test_wav2vec2_with_lm_invalid_pool(in_queue, out_queue, timeout):
try:
_ = in_queue.get(timeout=timeout)

ds = load_dataset("mozilla-foundation/common_voice_11_0", "es", split="test", streaming=True)
ds = load_dataset(
"mozilla-foundation/common_voice_11_0", "es", split="test", streaming=True, trust_remote_code=True
)
sample = next(iter(ds))

resampled_audio = torchaudio.functional.resample(
@ -1468,7 +1470,7 @@ class Wav2Vec2ModelIntegrationTest(unittest.TestCase):
return [x["array"] for x in speech_samples]

def _load_superb(self, task, num_samples):
ds = load_dataset("anton-l/superb_dummy", task, split="test")
ds = load_dataset("anton-l/superb_dummy", task, split="test", trust_remote_code=True)

return ds[:num_samples]

@ -1834,7 +1836,9 @@ class Wav2Vec2ModelIntegrationTest(unittest.TestCase):
@require_pyctcdecode
@require_torchaudio
def test_wav2vec2_with_lm(self):
ds = load_dataset("mozilla-foundation/common_voice_11_0", "es", split="test", streaming=True)
ds = load_dataset(
"mozilla-foundation/common_voice_11_0", "es", split="test", streaming=True, trust_remote_code=True
)
sample = next(iter(ds))

resampled_audio = torchaudio.functional.resample(
@ -1858,7 +1862,9 @@ class Wav2Vec2ModelIntegrationTest(unittest.TestCase):
@require_pyctcdecode
@require_torchaudio
def test_wav2vec2_with_lm_pool(self):
ds = load_dataset("mozilla-foundation/common_voice_11_0", "es", split="test", streaming=True)
ds = load_dataset(
"mozilla-foundation/common_voice_11_0", "es", split="test", streaming=True, trust_remote_code=True
)
sample = next(iter(ds))

resampled_audio = torchaudio.functional.resample(
@ -1957,7 +1963,9 @@ class Wav2Vec2ModelIntegrationTest(unittest.TestCase):
LANG_MAP = {"it": "ita", "es": "spa", "fr": "fra", "en": "eng"}

def run_model(lang):
ds = load_dataset("mozilla-foundation/common_voice_11_0", lang, split="test", streaming=True)
ds = load_dataset(
"mozilla-foundation/common_voice_11_0", lang, split="test", streaming=True, trust_remote_code=True
)
sample = next(iter(ds))

wav2vec2_lang = LANG_MAP[lang]

@ -463,7 +463,9 @@ class Wav2Vec2ProcessorWithLMTest(unittest.TestCase):
def test_word_time_stamp_integration(self):
import torch

ds = load_dataset("mozilla-foundation/common_voice_11_0", "en", split="train", streaming=True)
ds = load_dataset(
"mozilla-foundation/common_voice_11_0", "en", split="train", streaming=True, trust_remote_code=True
)
ds = ds.cast_column("audio", datasets.Audio(sampling_rate=16_000))
ds_iter = iter(ds)
sample = next(ds_iter)

@ -473,7 +473,7 @@ class WavLMModelIntegrationTest(unittest.TestCase):
return [x["array"] for x in speech_samples]

def _load_superb(self, task, num_samples):
ds = load_dataset("anton-l/superb_dummy", task, split="test")
ds = load_dataset("anton-l/superb_dummy", task, split="test", trust_remote_code=True)

return ds[:num_samples]


@ -1645,7 +1645,9 @@ class WhisperModelIntegrationTests(unittest.TestCase):
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v3")
model.to(torch_device)

ds = load_dataset("facebook/multilingual_librispeech", "german", split="test", streaming=True)
ds = load_dataset(
"facebook/multilingual_librispeech", "german", split="test", streaming=True, trust_remote_code=True
)
ds = ds.cast_column("audio", datasets.Audio(sampling_rate=16_000))

input_speech = next(iter(ds))["audio"]["array"]
@ -1712,10 +1714,11 @@ class WhisperModelIntegrationTests(unittest.TestCase):

token = os.getenv("HF_HUB_READ_TOKEN", True)
ds = load_dataset(
"hf-internal-testing/fixtures_common_voice",
"mozilla-foundation/common_voice_6_1",
"ja",
split="test",
streaming=True,
trust_remote_code=True,
token=token,
)
ds = ds.cast_column("audio", datasets.Audio(sampling_rate=16_000))
@ -1725,10 +1728,7 @@ class WhisperModelIntegrationTests(unittest.TestCase):
torch_device
)

EXPECTED_TRANSCRIPTS = [
"夏の時期の時期でした",
" It was the time of day and all of the pens left during the summer.",
]
EXPECTED_TRANSCRIPTS = ["木村さんに電話を貸してもらいました", " Kimura-san called me."]

generated_ids = model.generate(
input_features.repeat(2, 1, 1),

@ -179,7 +179,7 @@ class AudioClassificationPipelineTests(unittest.TestCase):
model = "superb/wav2vec2-base-superb-ks"

audio_classifier = pipeline("audio-classification", model=model)
dataset = datasets.load_dataset("anton-l/superb_dummy", "ks", split="test")
dataset = datasets.load_dataset("anton-l/superb_dummy", "ks", split="test", trust_remote_code=True)

audio = np.array(dataset[3]["speech"], dtype=np.float32)
output = audio_classifier(audio, top_k=4)

@ -265,7 +265,9 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
@require_torch
@require_pyctcdecode
def test_large_model_pt_with_lm(self):
filename = hf_hub_download("Narsil/asr_dummy", filename="4.flac", repo_type="dataset")
dataset = load_dataset("Narsil/asr_dummy", streaming=True, trust_remote_code=True)
third_item = next(iter(dataset["test"].skip(3)))
filename = third_item["file"]

speech_recognizer = pipeline(
task="automatic-speech-recognition",
@ -386,7 +388,7 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
chunk_length_s=8,
stride_length_s=1,
)
data = load_dataset("openslr/librispeech_asr", "clean", split="test", streaming=True)
data = load_dataset("openslr/librispeech_asr", "clean", split="test", streaming=True, trust_remote_code=True)
sample = next(iter(data))

res = pipe(sample["audio"]["array"])
@ -432,7 +434,7 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
stride_length_s=1,
return_language=True,
)
data = load_dataset("openslr/librispeech_asr", "clean", split="test", streaming=True)
data = load_dataset("openslr/librispeech_asr", "clean", split="test", streaming=True, trust_remote_code=True)
sample = next(iter(data))

res = pipe(sample["audio"]["array"])
@ -487,7 +489,7 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
task="automatic-speech-recognition",
model="openai/whisper-tiny.en",
)
data = load_dataset("openslr/librispeech_asr", "clean", split="test", streaming=True)
data = load_dataset("openslr/librispeech_asr", "clean", split="test", streaming=True, trust_remote_code=True)
samples = [next(iter(data)) for _ in range(8)]
audio = np.concatenate([sample["audio"]["array"] for sample in samples])

@ -1123,7 +1125,9 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
@slow
def test_speculative_decoding_whisper_non_distil(self):
# Load data:
dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:1]")
dataset = load_dataset(
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:1]", trust_remote_code=True
)
sample = dataset[0]["audio"]

# Load model:
@ -1165,7 +1169,9 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
@slow
def test_speculative_decoding_whisper_distil(self):
# Load data:
dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:1]")
dataset = load_dataset(
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:1]", trust_remote_code=True
)
sample = dataset[0]["audio"]

# Load model:


@ -601,9 +601,9 @@ class ImageSegmentationPipelineTests(unittest.TestCase):

image_segmenter = pipeline("image-segmentation", model=model, image_processor=image_processor)

ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
image = ds[0]["image"].convert("RGB")
outputs = image_segmenter(image, threshold=threshold)
image = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
file = image[0]["file"]
outputs = image_segmenter(file, threshold=threshold)

# Shortening by hashing
for o in outputs:
@ -655,9 +655,9 @@ class ImageSegmentationPipelineTests(unittest.TestCase):
def test_oneformer(self):
image_segmenter = pipeline(model="shi-labs/oneformer_ade20k_swin_tiny")

ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
image = ds[0]["image"].convert("RGB")
outputs = image_segmenter(image, threshold=0.99)
image = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
file = image[0]["file"]
outputs = image_segmenter(file, threshold=0.99)
# Shortening by hashing
for o in outputs:
o["mask"] = mask_to_test_readable(o["mask"])
@ -679,7 +679,7 @@ class ImageSegmentationPipelineTests(unittest.TestCase):
)

# Different task
outputs = image_segmenter(image, threshold=0.99, subtask="instance")
outputs = image_segmenter(file, threshold=0.99, subtask="instance")
# Shortening by hashing
for o in outputs:
o["mask"] = mask_to_test_readable(o["mask"])
@ -701,7 +701,7 @@ class ImageSegmentationPipelineTests(unittest.TestCase):
)

# Different task
outputs = image_segmenter(image, subtask="semantic")
outputs = image_segmenter(file, subtask="semantic")
# Shortening by hashing
for o in outputs:
o["mask"] = mask_to_test_readable(o["mask"])

@ -3795,10 +3795,6 @@ class ModelTesterMixin:
self.skipTest(
"PaliGemma-like models currently (transformers==4.41.0) requires an attention_mask input"
)
if config.model_type in ["modernbert"]:
self.skipTest(
reason="ModernBert currently (transformers==4.52.0) automatically adds an attention_mask input"
)
if config.model_type in ["idefics", "idefics2", "idefics3"]:
self.skipTest(reason="Idefics currently (transformers==4.39.1) requires an image_attention_mask input")
if config.model_type in ["sam"]:

@ -62,4 +62,5 @@ if __name__ == "__main__":
start = end
end = start + num_jobs_per_splits + (1 if idx < num_jobs % args.num_splits else 0)
model_splits.append(d[start:end])
# model_splits = [["models/vit"], ["models/clip"]]
print(model_splits)