Mirror of https://github.com/huggingface/transformers.git, synced 2025-11-06 13:34:37 +08:00

Compare commits: trigger-remove-script-datasets-in-tests ... allow_ci_to_use_a10 (20 commits)
Commits (SHA1 only; author and date were not captured):

- 5e54865c88
- 0cf5408dea
- 8e6fa6b59f
- e304be32e9
- b898e54c73
- 7e00e28ccb
- a02f0a871e
- 8b37093b17
- d6ebe4d6ec
- 69f5fa1940
- 6dc035bd97
- 741f6632d5
- c593e135cb
- a436e29cae
- b4b503befd
- d0e5cea195
- 8f16ac0fae
- e336a60875
- c02d17e00d
- 56b91f3314
.github/workflows/model_jobs.yml (vendored, 6 changed lines)

@@ -12,8 +12,8 @@ on:
       slice_id:
         required: true
         type: number
-      runner:
-        required: true
+      runner_map:
+        required: false
         type: string
       docker:
         required: true

@@ -45,7 +45,7 @@ jobs:
       matrix:
         folders: ${{ fromJson(inputs.folder_slices)[inputs.slice_id] }}
     runs-on:
-      group: '${{ inputs.machine_type }}'
+      group: ${{ fromJson(inputs.runner_map)[matrix.folders][inputs.machine_type] }}
     container:
       image: ${{ inputs.docker }}
       options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
.github/workflows/model_jobs_amd.yml (vendored, 128 lines, file deleted)

@@ -1,128 +0,0 @@
-name: model jobs
-
-on:
-  workflow_call:
-    inputs:
-      folder_slices:
-        required: true
-        type: string
-      machine_type:
-        required: true
-        type: string
-      slice_id:
-        required: true
-        type: number
-      runner:
-        required: true
-        type: string
-      docker:
-        required: true
-        type: string
-
-env:
-  HF_HOME: /mnt/cache
-  TRANSFORMERS_IS_CI: yes
-  OMP_NUM_THREADS: 8
-  MKL_NUM_THREADS: 8
-  RUN_SLOW: yes
-  # For gated repositories, we still need to agree to share information on the Hub repo. page in order to get access.
-  # This token is created under the bot `hf-transformers-bot`.
-  HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
-  SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
-  TF_FORCE_GPU_ALLOW_GROWTH: true
-  CUDA_VISIBLE_DEVICES: 0,1
-
-jobs:
-  run_models_gpu:
-    name: " "
-    strategy:
-      max-parallel: 1  # For now, not to parallelize. Can change later if it works well.
-      fail-fast: false
-      matrix:
-        folders: ${{ fromJson(inputs.folder_slices)[inputs.slice_id] }}
-    runs-on: ['${{ inputs.machine_type }}', self-hosted, amd-gpu, '${{ inputs.runner }}']
-    container:
-      image: ${{ inputs.docker }}
-      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    steps:
-      - name: Echo input and matrix info
-        shell: bash
-        run: |
-          echo "${{ inputs.folder_slices }}"
-          echo "${{ matrix.folders }}"
-          echo "${{ toJson(fromJson(inputs.folder_slices)[inputs.slice_id]) }}"
-
-      - name: Echo folder ${{ matrix.folders }}
-        shell: bash
-        # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
-        # set the artifact folder names (because the character `/` is not allowed).
-        run: |
-          echo "${{ matrix.folders }}"
-          matrix_folders=${{ matrix.folders }}
-          matrix_folders=${matrix_folders/'models/'/'models_'}
-          echo "$matrix_folders"
-          echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
-
-      - name: Update clone
-        working-directory: /transformers
-        run: git fetch && git checkout ${{ github.sha }}
-
-      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
-        working-directory: /transformers
-        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
-
-      - name: Update / Install some packages (for Past CI)
-        if: ${{ contains(inputs.docker, '-past-') }}
-        working-directory: /transformers
-        run: |
-          python3 -m pip install -U datasets
-
-      - name: Update / Install some packages (for Past CI)
-        if: ${{ contains(inputs.docker, '-past-') && contains(inputs.docker, '-pytorch-') }}
-        working-directory: /transformers
-        run: |
-          python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
-
-      - name: ROCM-SMI
-        run: |
-          rocm-smi
-
-      - name: ROCM-INFO
-        run: |
-          rocminfo | grep "Agent" -A 14
-
-      - name: Show ROCR environment
-        run: |
-          echo "ROCR: $ROCR_VISIBLE_DEVICES"
-
-      - name: Environment
-        working-directory: /transformers
-        run: |
-          python3 utils/print_env.py
-
-      - name: Show installed libraries and their versions
-        working-directory: /transformers
-        run: pip freeze
-
-      - name: Run all tests on GPU
-        working-directory: /transformers
-        run: python3 -m pytest -rsfE -v --make-reports=${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }} -m "not not_device_test"
-
-      - name: Failure short reports
-        if: ${{ failure() }}
-        continue-on-error: true
-        run: cat /transformers/reports/${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/failures_short.txt
-
-      - name: Run test
-        shell: bash
-        run: |
-          mkdir -p /transformers/reports/${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports
-          echo "hello" > /transformers/reports/${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/hello.txt
-          echo "${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports"
-
-      - name: "Test suite reports artifacts: ${{ inputs.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports"
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v4
-        with:
-          name: ${{ inputs.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports
-          path: /transformers/reports/${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports
.github/workflows/self-scheduled-caller.yml (vendored, 68 changed lines)

@@ -7,7 +7,7 @@ on:
     - cron: "17 2 * * *"
   push:
     branches:
-      - trigger-remove-script-datasets-in-tests
+      - allow_ci_to_use_a10
   workflow_dispatch:
     inputs:
       prev_workflow_run_id:

@@ -22,10 +22,10 @@ on:
         default: ""


-# Used for `push` to easily modiffy the target workflow runs to compare against
+# Used for `push` to easily modify the target workflow runs to compare against
 env:
   prev_workflow_run_id: ""
-  other_workflow_run_id: "15770139098"
+  other_workflow_run_id: ""


 jobs:

@@ -51,8 +51,68 @@ jobs:
     with:
       job: run_models_gpu
      slack_report_channel: "#transformers-ci-daily-models"
-      runner: daily-ci
      docker: huggingface/transformers-all-latest-gpu
      ci_event: Daily CI
      report_repo_id: hf-internal-testing/transformers_daily_ci
    secrets: inherit

+# torch-pipeline:
+#   name: Torch pipeline CI
+#   uses: ./.github/workflows/self-scheduled.yml
+#   with:
+#     job: run_pipelines_torch_gpu
+#     slack_report_channel: "#transformers-ci-daily-pipeline-torch"
+#     runner: daily-ci
+#     docker: huggingface/transformers-pytorch-gpu
+#     ci_event: Daily CI
+#     report_repo_id: hf-internal-testing/transformers_daily_ci
+#   secrets: inherit
+#
+# example-ci:
+#   name: Example CI
+#   uses: ./.github/workflows/self-scheduled.yml
+#   with:
+#     job: run_examples_gpu
+#     slack_report_channel: "#transformers-ci-daily-examples"
+#     runner: daily-ci
+#     docker: huggingface/transformers-all-latest-gpu
+#     ci_event: Daily CI
+#     report_repo_id: hf-internal-testing/transformers_daily_ci
+#   secrets: inherit
+#
+# trainer-fsdp-ci:
+#   name: Trainer/FSDP CI
+#   uses: ./.github/workflows/self-scheduled.yml
+#   with:
+#     job: run_trainer_and_fsdp_gpu
+#     slack_report_channel: "#transformers-ci-daily-training"
+#     runner: daily-ci
+#     docker: huggingface/transformers-all-latest-gpu
+#     ci_event: Daily CI
+#     report_repo_id: hf-internal-testing/transformers_daily_ci
+#   secrets: inherit
+#
+# deepspeed-ci:
+#   name: DeepSpeed CI
+#   uses: ./.github/workflows/self-scheduled.yml
+#   with:
+#     job: run_torch_cuda_extensions_gpu
+#     slack_report_channel: "#transformers-ci-daily-training"
+#     runner: daily-ci
+#     docker: huggingface/transformers-pytorch-deepspeed-latest-gpu
+#     ci_event: Daily CI
+#     working-directory-prefix: /workspace
+#     report_repo_id: hf-internal-testing/transformers_daily_ci
+#   secrets: inherit
+#
+# quantization-ci:
+#   name: Quantization CI
+#   uses: ./.github/workflows/self-scheduled.yml
+#   with:
+#     job: run_quantization_torch_gpu
+#     slack_report_channel: "#transformers-ci-daily-quantization"
+#     runner: daily-ci
+#     docker: huggingface/transformers-quantization-latest-gpu
+#     ci_event: Daily CI
+#     report_repo_id: hf-internal-testing/transformers_daily_ci
+#   secrets: inherit
.github/workflows/self-scheduled.yml (vendored, 10 changed lines)

@@ -15,9 +15,6 @@ on:
      slack_report_channel:
        required: true
        type: string
-      runner:
-        required: true
-        type: string
      docker:
        required: true
        type: string

@@ -62,6 +59,7 @@ jobs:
    outputs:
      folder_slices: ${{ steps.set-matrix.outputs.folder_slices }}
      slice_ids: ${{ steps.set-matrix.outputs.slice_ids }}
+      runner_map: ${{ steps.set-matrix.outputs.runner_map }}
      quantization_matrix: ${{ steps.set-matrix-quantization.outputs.quantization_matrix }}
    steps:
      - name: Update clone

@@ -88,6 +86,7 @@ jobs:
          if [ "${{ inputs.job }}" = "run_models_gpu" ]; then
            echo "folder_slices=$(python3 ../utils/split_model_tests.py --num_splits ${{ env.NUM_SLICES }})" >> $GITHUB_OUTPUT
            echo "slice_ids=$(python3 -c 'd = list(range(${{ env.NUM_SLICES }})); print(d)')" >> $GITHUB_OUTPUT
+            echo "runner_map=$(python3 ../utils/get_runner_map.py)" >> $GITHUB_OUTPUT
          elif [ "${{ inputs.job }}" = "run_trainer_and_fsdp_gpu" ]; then
            echo "folder_slices=[['trainer'], ['fsdp']]" >> $GITHUB_OUTPUT
            echo "slice_ids=[0, 1]" >> $GITHUB_OUTPUT

@@ -111,14 +110,14 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
+        machine_type: [single-gpu, multi-gpu]
        slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }}
    uses: ./.github/workflows/model_jobs.yml
    with:
      folder_slices: ${{ needs.setup.outputs.folder_slices }}
      machine_type: ${{ matrix.machine_type }}
      slice_id: ${{ matrix.slice_id }}
-      runner: ${{ inputs.runner }}
+      runner_map: ${{ needs.setup.outputs.runner_map }}
      docker: ${{ inputs.docker }}
    secrets: inherit

@@ -136,7 +135,6 @@ jobs:
      folder_slices: ${{ needs.setup.outputs.folder_slices }}
      machine_type: ${{ matrix.machine_type }}
      slice_id: ${{ matrix.slice_id }}
-      runner: ${{ inputs.runner }}
      docker: ${{ inputs.docker }}
      report_name_prefix: run_trainer_and_fsdp_gpu
    secrets: inherit
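The `runs-on.group` expression in model_jobs.yml above, `fromJson(inputs.runner_map)[matrix.folders][inputs.machine_type]`, implies that `utils/get_runner_map.py` (not part of this compare) prints a JSON object keyed first by test folder and then by machine type. A minimal sketch of output in that shape; the folder names and the second pair of runner groups are assumptions for illustration only:

```python
import json

# Hypothetical mapping: one runner group per (test folder, machine type) pair.
# The "aws-g4dn-*-cache" groups are the ones previously hard-coded in
# self-scheduled.yml; the g5 entries are made up to show folders can differ.
runner_map = {
    "models/bert": {
        "single-gpu": "aws-g4dn-4xlarge-cache",
        "multi-gpu": "aws-g4dn-12xlarge-cache",
    },
    "models/llama": {
        "single-gpu": "aws-g5-4xlarge-cache",
        "multi-gpu": "aws-g5-12xlarge-cache",
    },
}
print(json.dumps(runner_map))
```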
Makefile (18 changed lines)

@@ -8,19 +8,13 @@ check_dirs := examples tests src utils
 exclude_folders := ""

 modified_only_fixup:
-	@current_branch=$$(git branch --show-current); \
-	if [ "$$current_branch" = "main" ]; then \
-		echo "On main branch, running 'style' target instead..."; \
-		$(MAKE) style; \
+	$(eval modified_py_files := $(shell python utils/get_modified_files.py $(check_dirs)))
+	@if test -n "$(modified_py_files)"; then \
+		echo "Checking/fixing $(modified_py_files)"; \
+		ruff check $(modified_py_files) --fix --exclude $(exclude_folders); \
+		ruff format $(modified_py_files) --exclude $(exclude_folders);\
 	else \
-		modified_py_files=$$(python utils/get_modified_files.py $(check_dirs)); \
-		if [ -n "$$modified_py_files" ]; then \
-			echo "Checking/fixing files: $${modified_py_files}"; \
-			ruff check $${modified_py_files} --fix --exclude $(exclude_folders); \
-			ruff format $${modified_py_files} --exclude $(exclude_folders); \
-		else \
-			echo "No library .py files were modified"; \
-		fi; \
+		echo "No library .py files were modified"; \
 	fi

 # Update src/transformers/dependency_versions_table.py
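Not part of the diff: a rough Python equivalent of the `+` side of the `modified_only_fixup` recipe, assuming `utils/get_modified_files.py` prints a space-separated list of changed `.py` files (which is how the recipe consumes its output):

```python
import subprocess

CHECK_DIRS = ["examples", "tests", "src", "utils"]

# Ask the repo helper which library .py files were modified.
result = subprocess.run(
    ["python", "utils/get_modified_files.py", *CHECK_DIRS],
    capture_output=True, text=True, check=True,
)
modified_py_files = result.stdout.split()

if modified_py_files:
    print(f"Checking/fixing {' '.join(modified_py_files)}")
    subprocess.run(["ruff", "check", *modified_py_files, "--fix"], check=True)
    subprocess.run(["ruff", "format", *modified_py_files], check=True)
else:
    print("No library .py files were modified")
```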
@@ -56,7 +56,7 @@ Here is how to use the processor to process text and audio:

 ```python
 >>> # let's load an audio sample from an Arabic speech corpus
 >>> from datasets import load_dataset
->>> dataset = load_dataset("halabi2016/arabic_speech_corpus", split="test", streaming=True)
+>>> dataset = load_dataset("arabic_speech_corpus", split="test", streaming=True, trust_remote_code=True)
 >>> audio_sample = next(iter(dataset))["audio"]

 >>> # now, process it

(The same hunk appears in a second documentation file.)

@@ -56,7 +56,7 @@ Here is how to use the processor to process text and audio:

 ```python
 >>> # let's load an audio sample from an Arabic speech corpus
 >>> from datasets import load_dataset
->>> dataset = load_dataset("halabi2016/arabic_speech_corpus", split="test", streaming=True)
+>>> dataset = load_dataset("arabic_speech_corpus", split="test", streaming=True, trust_remote_code=True)
 >>> audio_sample = next(iter(dataset))["audio"]

 >>> # now, process it
@@ -493,33 +493,6 @@ training_args = TrainingArguments(
 )
 ```

-You can also configure which specific kernels to apply using the `liger_kernel_config` parameter. This dict is passed as keyword arguments to the `_apply_liger_kernel_to_instance` function, allowing fine-grained control over kernel usage. Available options vary by model but typically include: `rope`, `swiglu`, `cross_entropy`, `fused_linear_cross_entropy`, `rms_norm`, etc.
-
-```py
-from transformers import TrainingArguments
-
-# Apply only specific kernels
-training_args = TrainingArguments(
-    output_dir="your-model",
-    learning_rate=2e-5,
-    per_device_train_batch_size=16,
-    per_device_eval_batch_size=16,
-    num_train_epochs=2,
-    weight_decay=0.01,
-    eval_strategy="epoch",
-    save_strategy="epoch",
-    load_best_model_at_end=True,
-    push_to_hub=True,
-    use_liger_kernel=True,
-    liger_kernel_config={
-        "rope": True,
-        "cross_entropy": True,
-        "rms_norm": False,  # Don't apply Liger's RMSNorm kernel
-        "swiglu": True,
-    }
-)
-```
-
 ### NEFTune

 [NEFTune](https://hf.co/papers/2310.05914) adds noise to the embedding vectors during training to improve model performance. Enable it in [`Trainer`] with the `neftune_noise_alpha` parameter in [`TrainingArguments`] to control how much noise is added.
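The NEFTune paragraph above is unchanged on both sides of the diff; for reference, a minimal sketch of enabling it (the `neftune_noise_alpha` value is arbitrary and chosen only for illustration):

```python
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="your-model",
    neftune_noise_alpha=0.1,  # strength of the noise added to embedding vectors
)
```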
@@ -264,6 +264,7 @@ class ExamplesTests(TestCasePlus):
            --dataset_config clean
            --train_split_name validation
            --eval_split_name validation
+            --trust_remote_code
            --output_dir {tmp_dir}
            --overwrite_output_dir
            --num_train_epochs=2

@@ -312,6 +312,7 @@ class ExamplesTestsNoTrainer(TestCasePlus):
            {self.examples_dir}/pytorch/image-classification/run_image_classification_no_trainer.py
            --model_name_or_path google/vit-base-patch16-224-in21k
            --dataset_name hf-internal-testing/cats_vs_dogs_sample
+            --trust_remote_code
            --learning_rate 1e-4
            --per_device_train_batch_size 2
            --per_device_eval_batch_size 1

@@ -390,6 +390,7 @@ class ExamplesTests(TestCasePlus):
            --output_dir {tmp_dir}
            --model_name_or_path google/vit-base-patch16-224-in21k
            --dataset_name hf-internal-testing/cats_vs_dogs_sample
+            --trust_remote_code
            --do_train
            --do_eval
            --learning_rate 1e-4

@@ -423,6 +424,7 @@ class ExamplesTests(TestCasePlus):
            --dataset_config_name clean
            --train_split_name validation
            --eval_split_name validation
+            --trust_remote_code
            --do_train
            --do_eval
            --learning_rate 1e-4

@@ -453,6 +455,7 @@ class ExamplesTests(TestCasePlus):
            --dataset_config_name clean
            --train_split_name validation
            --eval_split_name validation
+            --trust_remote_code
            --do_train
            --do_eval
            --learning_rate 1e-4

@@ -485,6 +488,7 @@ class ExamplesTests(TestCasePlus):
            --dataset_config_name clean
            --train_split_name validation
            --eval_split_name validation
+            --trust_remote_code
            --do_train
            --do_eval
            --learning_rate 1e-4

@@ -512,6 +516,7 @@ class ExamplesTests(TestCasePlus):
            --output_dir {tmp_dir}
            --model_name_or_path hf-internal-testing/tiny-random-wav2vec2
            --dataset_name anton-l/superb_demo
+            --trust_remote_code
            --dataset_config_name ks
            --train_split_name test
            --eval_split_name test

@@ -546,6 +551,7 @@ class ExamplesTests(TestCasePlus):
            --dataset_name hf-internal-testing/librispeech_asr_dummy
            --dataset_config_names clean
            --dataset_split_names validation
+            --trust_remote_code
            --learning_rate 1e-4
            --per_device_train_batch_size 4
            --per_device_eval_batch_size 4

@@ -566,6 +572,7 @@ class ExamplesTests(TestCasePlus):
            run_mae.py
            --output_dir {tmp_dir}
            --dataset_name hf-internal-testing/cats_vs_dogs_sample
+            --trust_remote_code
            --do_train
            --do_eval
            --learning_rate 1e-4

@@ -315,6 +315,7 @@ class ExamplesTests(TestCasePlus):
        testargs = f"""
            run_image_classification.py
            --dataset_name hf-internal-testing/cats_vs_dogs_sample
+            --trust_remote_code
            --model_name_or_path microsoft/resnet-18
            --do_train
            --do_eval
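Every hunk above adds the same `--trust_remote_code` flag; the example scripts forward it to `datasets.load_dataset`, which needs it for datasets that still ship a loading script. A minimal sketch using one of the datasets referenced in these tests (whether a given dataset still needs the flag depends on the `datasets` version and on whether the dataset has since been converted to Parquet):

```python
from datasets import load_dataset

ds = load_dataset(
    "hf-internal-testing/librispeech_asr_dummy",
    "clean",
    split="validation",
    trust_remote_code=True,  # opt in to running the dataset's loading script
)
print(ds[0]["audio"]["array"][:5])
```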
@@ -29,6 +29,7 @@ import warnings
 from collections import defaultdict
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from contextlib import contextmanager
+from dataclasses import dataclass
 from enum import Enum
 from functools import partial, wraps
 from threading import Thread

@@ -40,6 +41,7 @@ from huggingface_hub import split_torch_state_dict_into_shards
 from packaging import version
 from torch import Tensor, nn
 from torch.distributions import constraints
+from torch.nn import CrossEntropyLoss, Identity
 from torch.utils.checkpoint import checkpoint

 from transformers.utils import is_torchao_available

@@ -48,6 +50,7 @@ from transformers.utils import is_torchao_available
 if is_torchao_available():
     from torchao.quantization import Int4WeightOnlyConfig

+from .activations import get_activation
 from .configuration_utils import PretrainedConfig
 from .dynamic_module_utils import custom_object_save
 from .generation import CompileConfig, GenerationConfig

@@ -95,6 +98,7 @@ from .utils import (
     WEIGHTS_INDEX_NAME,
     WEIGHTS_NAME,
     ContextManagers,
+    ModelOutput,
     PushToHubMixin,
     cached_file,
     check_torch_load_is_safe,

@@ -119,6 +123,7 @@ from .utils import (
     is_torch_xla_available,
     is_torch_xpu_available,
     logging,
+    replace_return_docstrings,
     strtobool,
 )
 from .utils.generic import GeneralInterface
@@ -5619,6 +5624,453 @@ if PreTrainedModel.push_to_hub.__doc__ is not None:
     )


+class PoolerStartLogits(nn.Module):
+    """
+    Compute SQuAD start logits from sequence hidden states.
+
+    Args:
+        config ([`PretrainedConfig`]):
+            The config used by the model, will be used to grab the `hidden_size` of the model.
+    """
+
+    def __init__(self, config: PretrainedConfig):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, 1)
+        logger.warning_once(
+            "[DEPRECATION WARNING] `PoolerStartLogits` is deprecated and will be removed in v4.53. "
+            "Please use model-specific class, e.g. `XLMPoolerStartLogits`."
+        )
+
+    def forward(
+        self, hidden_states: torch.FloatTensor, p_mask: Optional[torch.FloatTensor] = None
+    ) -> torch.FloatTensor:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`):
+                The final hidden states of the model.
+            p_mask (`torch.FloatTensor` of shape `(batch_size, seq_len)`, *optional*):
+                Mask for tokens at invalid position, such as query and special symbols (PAD, SEP, CLS). 1.0 means token
+                should be masked.
+
+        Returns:
+            `torch.FloatTensor`: The start logits for SQuAD.
+        """
+        x = self.dense(hidden_states).squeeze(-1)
+
+        if p_mask is not None:
+            if get_parameter_dtype(self) == torch.float16:
+                x = x * (1 - p_mask) - 65500 * p_mask
+            else:
+                x = x * (1 - p_mask) - 1e30 * p_mask
+
+        return x
+
+
+class PoolerEndLogits(nn.Module):
+    """
+    Compute SQuAD end logits from sequence hidden states.
+
+    Args:
+        config ([`PretrainedConfig`]):
+            The config used by the model, will be used to grab the `hidden_size` of the model and the `layer_norm_eps`
+            to use.
+    """
+
+    def __init__(self, config: PretrainedConfig):
+        super().__init__()
+        self.dense_0 = nn.Linear(config.hidden_size * 2, config.hidden_size)
+        self.activation = nn.Tanh()
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dense_1 = nn.Linear(config.hidden_size, 1)
+        logger.warning_once(
+            "[DEPRECATION WARNING] `PoolerEndLogits` is deprecated and will be removed in v4.53. "
+            "Please use model-specific class, e.g. `XLMPoolerEndLogits`."
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        start_states: Optional[torch.FloatTensor] = None,
+        start_positions: Optional[torch.LongTensor] = None,
+        p_mask: Optional[torch.FloatTensor] = None,
+    ) -> torch.FloatTensor:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`):
+                The final hidden states of the model.
+            start_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`, *optional*):
+                The hidden states of the first tokens for the labeled span.
+            start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+                The position of the first token for the labeled span.
+            p_mask (`torch.FloatTensor` of shape `(batch_size, seq_len)`, *optional*):
+                Mask for tokens at invalid position, such as query and special symbols (PAD, SEP, CLS). 1.0 means token
+                should be masked.
+
+        <Tip>
+
+        One of `start_states` or `start_positions` should be not `None`. If both are set, `start_positions` overrides
+        `start_states`.
+
+        </Tip>
+
+        Returns:
+            `torch.FloatTensor`: The end logits for SQuAD.
+        """
+        assert start_states is not None or start_positions is not None, (
+            "One of start_states, start_positions should be not None"
+        )
+        if start_positions is not None:
+            slen, hsz = hidden_states.shape[-2:]
+            start_positions = start_positions[:, None, None].expand(-1, -1, hsz)  # shape (bsz, 1, hsz)
+            start_states = hidden_states.gather(-2, start_positions)  # shape (bsz, 1, hsz)
+            start_states = start_states.expand(-1, slen, -1)  # shape (bsz, slen, hsz)
+
+        x = self.dense_0(torch.cat([hidden_states, start_states], dim=-1))
+        x = self.activation(x)
+        x = self.LayerNorm(x)
+        x = self.dense_1(x).squeeze(-1)
+
+        if p_mask is not None:
+            if get_parameter_dtype(self) == torch.float16:
+                x = x * (1 - p_mask) - 65500 * p_mask
+            else:
+                x = x * (1 - p_mask) - 1e30 * p_mask
+
+        return x
+
+
+class PoolerAnswerClass(nn.Module):
+    """
+    Compute SQuAD 2.0 answer class from classification and start tokens hidden states.
+
+    Args:
+        config ([`PretrainedConfig`]):
+            The config used by the model, will be used to grab the `hidden_size` of the model.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        self.dense_0 = nn.Linear(config.hidden_size * 2, config.hidden_size)
+        self.activation = nn.Tanh()
+        self.dense_1 = nn.Linear(config.hidden_size, 1, bias=False)
+        logger.warning_once(
+            "[DEPRECATION WARNING] `PoolerAnswerClass` is deprecated and will be removed in v4.53. "
+            "Please use model-specific class, e.g. `XLMPoolerAnswerClass`."
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        start_states: Optional[torch.FloatTensor] = None,
+        start_positions: Optional[torch.LongTensor] = None,
+        cls_index: Optional[torch.LongTensor] = None,
+    ) -> torch.FloatTensor:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`):
+                The final hidden states of the model.
+            start_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`, *optional*):
+                The hidden states of the first tokens for the labeled span.
+            start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+                The position of the first token for the labeled span.
+            cls_index (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+                Position of the CLS token for each sentence in the batch. If `None`, takes the last token.
+
+        <Tip>
+
+        One of `start_states` or `start_positions` should be not `None`. If both are set, `start_positions` overrides
+        `start_states`.
+
+        </Tip>
+
+        Returns:
+            `torch.FloatTensor`: The SQuAD 2.0 answer class.
+        """
+        # No dependency on end_feature so that we can obtain one single `cls_logits` for each sample.
+        hsz = hidden_states.shape[-1]
+        assert start_states is not None or start_positions is not None, (
+            "One of start_states, start_positions should be not None"
+        )
+        if start_positions is not None:
+            start_positions = start_positions[:, None, None].expand(-1, -1, hsz)  # shape (bsz, 1, hsz)
+            start_states = hidden_states.gather(-2, start_positions).squeeze(-2)  # shape (bsz, hsz)
+
+        if cls_index is not None:
+            cls_index = cls_index[:, None, None].expand(-1, -1, hsz)  # shape (bsz, 1, hsz)
+            cls_token_state = hidden_states.gather(-2, cls_index).squeeze(-2)  # shape (bsz, hsz)
+        else:
+            cls_token_state = hidden_states[:, -1, :]  # shape (bsz, hsz)
+
+        x = self.dense_0(torch.cat([start_states, cls_token_state], dim=-1))
+        x = self.activation(x)
+        x = self.dense_1(x).squeeze(-1)
+
+        return x
+
+
+@dataclass
+class SquadHeadOutput(ModelOutput):
+    """
+    Base class for outputs of question answering models using a [`~modeling_utils.SQuADHead`].
+
+    Args:
+        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned if both `start_positions` and `end_positions` are provided):
+            Classification loss as the sum of start token, end token (and is_impossible if provided) classification
+            losses.
+        start_top_log_probs (`torch.FloatTensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
+            Log probabilities for the top config.start_n_top start token possibilities (beam-search).
+        start_top_index (`torch.LongTensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
+            Indices for the top config.start_n_top start token possibilities (beam-search).
+        end_top_log_probs (`torch.FloatTensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
+            Log probabilities for the top `config.start_n_top * config.end_n_top` end token possibilities
+            (beam-search).
+        end_top_index (`torch.LongTensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
+            Indices for the top `config.start_n_top * config.end_n_top` end token possibilities (beam-search).
+        cls_logits (`torch.FloatTensor` of shape `(batch_size,)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
+            Log probabilities for the `is_impossible` label of the answers.
+
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    start_top_log_probs: Optional[torch.FloatTensor] = None
+    start_top_index: Optional[torch.LongTensor] = None
+    end_top_log_probs: Optional[torch.FloatTensor] = None
+    end_top_index: Optional[torch.LongTensor] = None
+    cls_logits: Optional[torch.FloatTensor] = None
+
+    def __post_init__(self):
+        logger.warning_once(
+            "[DEPRECATION WARNING] `SquadHeadOutput` is deprecated and will be removed in v4.53. "
+            "Please use model-specific class, e.g. `XLMSquadHeadOutput`."
+        )
+
+
+class SQuADHead(nn.Module):
+    r"""
+    A SQuAD head inspired by XLNet.
+
+    Args:
+        config ([`PretrainedConfig`]):
+            The config used by the model, will be used to grab the `hidden_size` of the model and the `layer_norm_eps`
+            to use.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        self.start_n_top = config.start_n_top
+        self.end_n_top = config.end_n_top
+
+        self.start_logits = PoolerStartLogits(config)
+        self.end_logits = PoolerEndLogits(config)
+        self.answer_class = PoolerAnswerClass(config)
+
+        logger.warning_once(
+            "[DEPRECATION WARNING] `SQuADHead` is deprecated and will be removed in v4.53. "
+            "Please use model-specific class, e.g. `XLMSQuADHead`."
+        )
+
+    @replace_return_docstrings(output_type=SquadHeadOutput, config_class=PretrainedConfig)
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        start_positions: Optional[torch.LongTensor] = None,
+        end_positions: Optional[torch.LongTensor] = None,
+        cls_index: Optional[torch.LongTensor] = None,
+        is_impossible: Optional[torch.LongTensor] = None,
+        p_mask: Optional[torch.FloatTensor] = None,
+        return_dict: bool = False,
+    ) -> Union[SquadHeadOutput, tuple[torch.FloatTensor]]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`):
+                Final hidden states of the model on the sequence tokens.
+            start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+                Positions of the first token for the labeled span.
+            end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+                Positions of the last token for the labeled span.
+            cls_index (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+                Position of the CLS token for each sentence in the batch. If `None`, takes the last token.
+            is_impossible (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+                Whether the question has a possible answer in the paragraph or not.
+            p_mask (`torch.FloatTensor` of shape `(batch_size, seq_len)`, *optional*):
+                Mask for tokens at invalid position, such as query and special symbols (PAD, SEP, CLS). 1.0 means token
+                should be masked.
+            return_dict (`bool`, *optional*, defaults to `False`):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+
+        Returns:
+        """
+        start_logits = self.start_logits(hidden_states, p_mask=p_mask)
+
+        if start_positions is not None and end_positions is not None:
+            # If we are on multi-GPU, let's remove the dimension added by batch splitting
+            for x in (start_positions, end_positions, cls_index, is_impossible):
+                if x is not None and x.dim() > 1:
+                    x.squeeze_(-1)
+
+            # during training, compute the end logits based on the ground truth of the start position
+            end_logits = self.end_logits(hidden_states, start_positions=start_positions, p_mask=p_mask)
+
+            loss_fct = CrossEntropyLoss()
+            start_loss = loss_fct(start_logits, start_positions)
+            end_loss = loss_fct(end_logits, end_positions)
+            total_loss = (start_loss + end_loss) / 2
+
+            if cls_index is not None and is_impossible is not None:
+                # Predict answerability from the representation of CLS and START
+                cls_logits = self.answer_class(hidden_states, start_positions=start_positions, cls_index=cls_index)
+                loss_fct_cls = nn.BCEWithLogitsLoss()
+                cls_loss = loss_fct_cls(cls_logits, is_impossible)
+
+                # note(zhiliny): by default multiply the loss by 0.5 so that the scale is comparable to start_loss and end_loss
+                total_loss += cls_loss * 0.5
+
+            return SquadHeadOutput(loss=total_loss) if return_dict else (total_loss,)
+
+        else:
+            # during inference, compute the end logits based on beam search
+            bsz, slen, hsz = hidden_states.size()
+            start_log_probs = nn.functional.softmax(start_logits, dim=-1)  # shape (bsz, slen)
+
+            start_top_log_probs, start_top_index = torch.topk(
+                start_log_probs, self.start_n_top, dim=-1
+            )  # shape (bsz, start_n_top)
+            start_top_index_exp = start_top_index.unsqueeze(-1).expand(-1, -1, hsz)  # shape (bsz, start_n_top, hsz)
+            start_states = torch.gather(hidden_states, -2, start_top_index_exp)  # shape (bsz, start_n_top, hsz)
+            start_states = start_states.unsqueeze(1).expand(-1, slen, -1, -1)  # shape (bsz, slen, start_n_top, hsz)
+
+            hidden_states_expanded = hidden_states.unsqueeze(2).expand_as(
+                start_states
+            )  # shape (bsz, slen, start_n_top, hsz)
+            p_mask = p_mask.unsqueeze(-1) if p_mask is not None else None
+            end_logits = self.end_logits(hidden_states_expanded, start_states=start_states, p_mask=p_mask)
+            end_log_probs = nn.functional.softmax(end_logits, dim=1)  # shape (bsz, slen, start_n_top)
+
+            end_top_log_probs, end_top_index = torch.topk(
+                end_log_probs, self.end_n_top, dim=1
+            )  # shape (bsz, end_n_top, start_n_top)
+            end_top_log_probs = end_top_log_probs.view(-1, self.start_n_top * self.end_n_top)
+            end_top_index = end_top_index.view(-1, self.start_n_top * self.end_n_top)
+
+            start_states = torch.einsum("blh,bl->bh", hidden_states, start_log_probs)
+            cls_logits = self.answer_class(hidden_states, start_states=start_states, cls_index=cls_index)
+
+            if not return_dict:
+                return (start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits)
+            else:
+                return SquadHeadOutput(
+                    start_top_log_probs=start_top_log_probs,
+                    start_top_index=start_top_index,
+                    end_top_log_probs=end_top_log_probs,
+                    end_top_index=end_top_index,
+                    cls_logits=cls_logits,
+                )
+
+
+class SequenceSummary(nn.Module):
+    r"""
+    Compute a single vector summary of a sequence hidden states.
+
+    Args:
+        config ([`PretrainedConfig`]):
+            The config used by the model. Relevant arguments in the config class of the model are (refer to the actual
+            config class of your model for the default values it uses):
+
+            - **summary_type** (`str`) -- The method to use to make this summary. Accepted values are:
+
+                - `"last"` -- Take the last token hidden state (like XLNet)
+                - `"first"` -- Take the first token hidden state (like Bert)
+                - `"mean"` -- Take the mean of all tokens hidden states
+                - `"cls_index"` -- Supply a Tensor of classification token position (GPT/GPT-2)
+                - `"attn"` -- Not implemented now, use multi-head attention
+
+            - **summary_use_proj** (`bool`) -- Add a projection after the vector extraction.
+            - **summary_proj_to_labels** (`bool`) -- If `True`, the projection outputs to `config.num_labels` classes
+              (otherwise to `config.hidden_size`).
+            - **summary_activation** (`Optional[str]`) -- Set to `"tanh"` to add a tanh activation to the output,
+              another string or `None` will add no activation.
+            - **summary_first_dropout** (`float`) -- Optional dropout probability before the projection and activation.
+            - **summary_last_dropout** (`float`)-- Optional dropout probability after the projection and activation.
+    """
+
+    def __init__(self, config: PretrainedConfig):
+        super().__init__()
+
+        self.summary_type = getattr(config, "summary_type", "last")
+        if self.summary_type == "attn":
+            # We should use a standard multi-head attention module with absolute positional embedding for that.
+            # Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276
+            # We can probably just use the multi-head attention module of PyTorch >=1.1.0
+            raise NotImplementedError
+
+        self.summary = Identity()
+        if hasattr(config, "summary_use_proj") and config.summary_use_proj:
+            if hasattr(config, "summary_proj_to_labels") and config.summary_proj_to_labels and config.num_labels > 0:
+                num_classes = config.num_labels
+            else:
+                num_classes = config.hidden_size
+            self.summary = nn.Linear(config.hidden_size, num_classes)
+
+        activation_string = getattr(config, "summary_activation", None)
+        self.activation: Callable = get_activation(activation_string) if activation_string else Identity()
+
+        self.first_dropout = Identity()
+        if hasattr(config, "summary_first_dropout") and config.summary_first_dropout > 0:
+            self.first_dropout = nn.Dropout(config.summary_first_dropout)
+
+        self.last_dropout = Identity()
+        if hasattr(config, "summary_last_dropout") and config.summary_last_dropout > 0:
+            self.last_dropout = nn.Dropout(config.summary_last_dropout)
+
+        logger.warning_once(
+            "[DEPRECATION WARNING] `SequenceSummary` is deprecated and will be removed in v4.53. "
+            "Please use model-specific class, e.g. `XLMSequenceSummary`."
+        )
+
+    def forward(
+        self, hidden_states: torch.FloatTensor, cls_index: Optional[torch.LongTensor] = None
+    ) -> torch.FloatTensor:
+        """
+        Compute a single vector summary of a sequence hidden states.
+
+        Args:
+            hidden_states (`torch.FloatTensor` of shape `[batch_size, seq_len, hidden_size]`):
+                The hidden states of the last layer.
+            cls_index (`torch.LongTensor` of shape `[batch_size]` or `[batch_size, ...]` where ... are optional leading dimensions of `hidden_states`, *optional*):
+                Used if `summary_type == "cls_index"` and takes the last token of the sequence as classification token.
+
+        Returns:
+            `torch.FloatTensor`: The summary of the sequence hidden states.
+        """
+        if self.summary_type == "last":
+            output = hidden_states[:, -1]
+        elif self.summary_type == "first":
+            output = hidden_states[:, 0]
+        elif self.summary_type == "mean":
+            output = hidden_states.mean(dim=1)
+        elif self.summary_type == "cls_index":
+            if cls_index is None:
+                cls_index = torch.full_like(
+                    hidden_states[..., :1, :],
+                    hidden_states.shape[-2] - 1,
+                    dtype=torch.long,
+                )
+            else:
+                cls_index = cls_index.unsqueeze(-1).unsqueeze(-1)
+                cls_index = cls_index.expand((-1,) * (cls_index.dim() - 1) + (hidden_states.size(-1),))
+            # shape of cls_index: (bsz, XX, 1, hidden_size) where XX are optional leading dim of hidden_states
+            output = hidden_states.gather(-2, cls_index).squeeze(-2)  # shape (bsz, XX, hidden_size)
+        elif self.summary_type == "attn":
+            raise NotImplementedError
+
+        output = self.first_dropout(output)
+        output = self.summary(output)
+        output = self.activation(output)
+        output = self.last_dropout(output)
+
+        return output
+
+
 def unwrap_model(model: nn.Module, recursive: bool = False) -> nn.Module:
     """
     Recursively unwraps a model from potential containers (as used in distributed training).
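The block above re-adds several long-deprecated heads verbatim. For orientation, a minimal sketch of how the simplest of them, `SequenceSummary`, is driven entirely by config attributes; this assumes a transformers build in which the class is importable from `transformers.modeling_utils`, as in the added code:

```python
import torch
from transformers import PretrainedConfig
from transformers.modeling_utils import SequenceSummary

# All behaviour comes from config attributes read via getattr/hasattr.
config = PretrainedConfig(
    hidden_size=8,
    summary_type="mean",          # average all token hidden states
    summary_use_proj=True,        # adds a Linear(hidden_size, hidden_size)
    summary_proj_to_labels=False,
    summary_activation="tanh",
)

summary = SequenceSummary(config)              # emits the deprecation warning
hidden_states = torch.randn(2, 5, config.hidden_size)
pooled = summary(hidden_states)                # mean over tokens -> projection -> tanh
print(pooled.shape)                            # torch.Size([2, 8])
```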
@@ -206,7 +206,7 @@ def convert_audio_spectrogram_transformer_checkpoint(model_name, pytorch_dump_fo

     if "speech-commands" in model_name:
         # TODO: Convert dataset to Parquet
-        dataset = load_dataset("google/speech_commands", "v0.02", split="validation")
+        dataset = load_dataset("google/speech_commands", "v0.02", split="validation", trust_remote_code=True)
         waveform = dataset[0]["audio"]["array"]
     else:
         filepath = hf_hub_download(
@@ -245,10 +245,6 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
         ("gpt_neox_japanese", ("GPTNeoXJapaneseTokenizer", None)),
         ("gptj", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
         ("gptsan-japanese", ("GPTSanJapaneseTokenizer", None)),
-        ("granite", ("GPT2Tokenizer", None)),
-        ("granitemoe", ("GPT2Tokenizer", None)),
-        ("granitemoehybrid", ("GPT2Tokenizer", None)),
-        ("granitemoeshared", ("GPT2Tokenizer", None)),
         ("grounding-dino", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
         ("groupvit", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)),
         ("helium", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
@@ -266,7 +266,7 @@ def convert_beit_checkpoint(checkpoint_url, pytorch_dump_folder_path):
     # Check outputs on an image
     if is_semantic:
         image_processor = BeitImageProcessor(size=config.image_size, do_center_crop=False)
-        ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
+        ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
         image = Image.open(ds[0]["file"])
     else:
         image_processor = BeitImageProcessor(
@@ -15,14 +15,7 @@

 """English Normalizer class for CLVP."""

-import sys
-
-
-if sys.version_info >= (3, 11):
-    # Atomic grouping support was only added to the core RE in Python 3.11
-    import re
-else:
-    import regex as re
+import re


 class EnglishNormalizer:

@@ -206,12 +199,12 @@ class EnglishNormalizer:
         This method is used to normalize numbers within a text such as converting the numbers to words, removing
         commas, etc.
         """
-        text = re.sub(r"([0-9][0-9,]+[0-9])", self._remove_commas, text)
-        text = re.sub(r"£([0-9,]*[0-9])", r"\1 pounds", text)
-        text = re.sub(r"\$([0-9.,]*[0-9])", self._expand_dollars, text)
-        text = re.sub(r"([0-9]++\.[0-9]+)", self._expand_decimal_point, text)
-        text = re.sub(r"[0-9]++(st|nd|rd|th)", self._expand_ordinal, text)
-        text = re.sub(r"[0-9]+", self._expand_number, text)
+        text = re.sub(re.compile(r"([0-9][0-9\,]+[0-9])"), self._remove_commas, text)
+        text = re.sub(re.compile(r"£([0-9\,]*[0-9]+)"), r"\1 pounds", text)
+        text = re.sub(re.compile(r"\$([0-9\.\,]*[0-9]+)"), self._expand_dollars, text)
+        text = re.sub(re.compile(r"([0-9]+\.[0-9]+)"), self._expand_decimal_point, text)
+        text = re.sub(re.compile(r"[0-9]+(st|nd|rd|th)"), self._expand_ordinal, text)
+        text = re.sub(re.compile(r"[0-9]+"), self._expand_number, text)
         return text

     def expand_abbreviations(self, text: str) -> str:
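For context: the `-` side relies on possessive quantifiers (`[0-9]++`), which the standard `re` module only supports from Python 3.11 (hence its conditional fallback to the third-party `regex` package), while the `+` side uses ordinary greedy quantifiers wrapped in `re.compile`, which work on any supported Python. A standalone sketch of the comma-removal step with a stand-in for the class's `_remove_commas` helper, which is not shown in this hunk:

```python
import re

def remove_commas(match: re.Match) -> str:
    # Stand-in for EnglishNormalizer._remove_commas, for illustration only.
    return match.group(1).replace(",", "")

text = "It sold 1,234,567 copies."
print(re.sub(re.compile(r"([0-9][0-9\,]+[0-9])"), remove_commas, text))
# -> It sold 1234567 copies.
```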
@@ -226,7 +226,7 @@ def convert_wav2vec2_checkpoint(

     processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-lv60")

-    ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+    ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True)
     input_audio = [x["array"] for x in ds[:4]["audio"]]

     inputs = processor(input_audio, return_tensors="pt", padding=True)
@@ -1223,7 +1223,7 @@ class LayoutLMForQuestionAnswering(LayoutLMPreTrainedModel):
        >>> tokenizer = AutoTokenizer.from_pretrained("impira/layoutlm-document-qa", add_prefix_space=True)
        >>> model = LayoutLMForQuestionAnswering.from_pretrained("impira/layoutlm-document-qa", revision="1e3ebac")

-        >>> dataset = load_dataset("nielsr/funsd", split="train")
+        >>> dataset = load_dataset("nielsr/funsd", split="train", trust_remote_code=True)
        >>> example = dataset[0]
        >>> question = "what's his name?"
        >>> words = example["words"]

@@ -1601,7 +1601,7 @@ class TFLayoutLMForQuestionAnswering(TFLayoutLMPreTrainedModel, TFQuestionAnswer
        >>> tokenizer = AutoTokenizer.from_pretrained("impira/layoutlm-document-qa", add_prefix_space=True)
        >>> model = TFLayoutLMForQuestionAnswering.from_pretrained("impira/layoutlm-document-qa", revision="1e3ebac")

-        >>> dataset = load_dataset("nielsr/funsd", split="train")
+        >>> dataset = load_dataset("nielsr/funsd", split="train", trust_remote_code=True)
        >>> example = dataset[0]
        >>> question = "what's his name?"
        >>> words = example["words"]
@@ -763,8 +763,9 @@ class LayoutLMv2Model(LayoutLMv2PreTrainedModel):
         >>> model = LayoutLMv2Model.from_pretrained("microsoft/layoutlmv2-base-uncased")
 
 
-        >>> dataset = load_dataset("hf-internal-testing/fixtures_docvqa")
-        >>> image = dataset["test"][0]["image"]
+        >>> dataset = load_dataset("hf-internal-testing/fixtures_docvqa", trust_remote_code=True)
+        >>> image_path = dataset["test"][0]["file"]
+        >>> image = Image.open(image_path).convert("RGB")
 
         >>> encoding = processor(image, return_tensors="pt")
 

@@ -952,7 +953,7 @@ class LayoutLMv2ForSequenceClassification(LayoutLMv2PreTrainedModel):
 
         >>> set_seed(0)
 
-        >>> dataset = load_dataset("aharley/rvl_cdip", split="train", streaming=True)
+        >>> dataset = load_dataset("aharley/rvl_cdip", split="train", streaming=True, trust_remote_code=True)
         >>> data = next(iter(dataset))
         >>> image = data["image"].convert("RGB")
 

@@ -1154,7 +1155,7 @@ class LayoutLMv2ForTokenClassification(LayoutLMv2PreTrainedModel):
 
         >>> set_seed(0)
 
-        >>> datasets = load_dataset("nielsr/funsd", split="test")
+        >>> datasets = load_dataset("nielsr/funsd", split="test", trust_remote_code=True)
         >>> labels = datasets.features["ner_tags"].feature.names
         >>> id2label = {v: k for v, k in enumerate(labels)}
 

@@ -1311,8 +1312,9 @@ class LayoutLMv2ForQuestionAnswering(LayoutLMv2PreTrainedModel):
         >>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv2-base-uncased")
         >>> model = LayoutLMv2ForQuestionAnswering.from_pretrained("microsoft/layoutlmv2-base-uncased")
 
-        >>> dataset = load_dataset("hf-internal-testing/fixtures_docvqa")
-        >>> image = dataset["test"][0]["image"]
+        >>> dataset = load_dataset("hf-internal-testing/fixtures_docvqa", trust_remote_code=True)
+        >>> image_path = dataset["test"][0]["file"]
+        >>> image = Image.open(image_path).convert("RGB")
         >>> question = "When is coffee break?"
         >>> encoding = processor(image, question, return_tensors="pt")
 
@@ -746,7 +746,7 @@ class LayoutLMv3Model(LayoutLMv3PreTrainedModel):
         >>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
         >>> model = AutoModel.from_pretrained("microsoft/layoutlmv3-base")
 
-        >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
+        >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
         >>> example = dataset[0]
         >>> image = example["image"]
         >>> words = example["tokens"]

@@ -961,7 +961,7 @@ class LayoutLMv3ForTokenClassification(LayoutLMv3PreTrainedModel):
         >>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
         >>> model = AutoModelForTokenClassification.from_pretrained("microsoft/layoutlmv3-base", num_labels=7)
 
-        >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
+        >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
         >>> example = dataset[0]
         >>> image = example["image"]
         >>> words = example["tokens"]

@@ -1062,7 +1062,7 @@ class LayoutLMv3ForQuestionAnswering(LayoutLMv3PreTrainedModel):
         >>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
         >>> model = AutoModelForQuestionAnswering.from_pretrained("microsoft/layoutlmv3-base")
 
-        >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
+        >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
         >>> example = dataset[0]
         >>> image = example["image"]
         >>> question = "what's his name?"

@@ -1182,7 +1182,7 @@ class LayoutLMv3ForSequenceClassification(LayoutLMv3PreTrainedModel):
         >>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
         >>> model = AutoModelForSequenceClassification.from_pretrained("microsoft/layoutlmv3-base")
 
-        >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
+        >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
         >>> example = dataset[0]
         >>> image = example["image"]
         >>> words = example["tokens"]

@@ -1296,7 +1296,7 @@ class TFLayoutLMv3Model(TFLayoutLMv3PreTrainedModel):
         >>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
         >>> model = TFAutoModel.from_pretrained("microsoft/layoutlmv3-base")
 
-        >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
+        >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
         >>> example = dataset[0]
         >>> image = example["image"]
         >>> words = example["tokens"]

@@ -1439,7 +1439,7 @@ class TFLayoutLMv3ForSequenceClassification(TFLayoutLMv3PreTrainedModel, TFSeque
         >>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
         >>> model = TFAutoModelForSequenceClassification.from_pretrained("microsoft/layoutlmv3-base")
 
-        >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
+        >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
         >>> example = dataset[0]
         >>> image = example["image"]
         >>> words = example["tokens"]

@@ -1566,7 +1566,7 @@ class TFLayoutLMv3ForTokenClassification(TFLayoutLMv3PreTrainedModel, TFTokenCla
         >>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
         >>> model = TFAutoModelForTokenClassification.from_pretrained("microsoft/layoutlmv3-base", num_labels=7)
 
-        >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
+        >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
         >>> example = dataset[0]
         >>> image = example["image"]
         >>> words = example["tokens"]

@@ -1703,7 +1703,7 @@ class TFLayoutLMv3ForQuestionAnswering(TFLayoutLMv3PreTrainedModel, TFQuestionAn
         >>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
         >>> model = TFAutoModelForQuestionAnswering.from_pretrained("microsoft/layoutlmv3-base")
 
-        >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
+        >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
         >>> example = dataset[0]
         >>> image = example["image"]
         >>> question = "what's his name?"
@@ -653,7 +653,7 @@ class LiltModel(LiltPreTrainedModel):
         >>> tokenizer = AutoTokenizer.from_pretrained("SCUT-DLVCLab/lilt-roberta-en-base")
         >>> model = AutoModel.from_pretrained("SCUT-DLVCLab/lilt-roberta-en-base")
 
-        >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
+        >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
         >>> example = dataset[0]
         >>> words = example["tokens"]
         >>> boxes = example["bboxes"]

@@ -793,7 +793,7 @@ class LiltForSequenceClassification(LiltPreTrainedModel):
         >>> tokenizer = AutoTokenizer.from_pretrained("SCUT-DLVCLab/lilt-roberta-en-base")
         >>> model = AutoModelForSequenceClassification.from_pretrained("SCUT-DLVCLab/lilt-roberta-en-base")
 
-        >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
+        >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
         >>> example = dataset[0]
         >>> words = example["tokens"]
         >>> boxes = example["bboxes"]

@@ -908,7 +908,7 @@ class LiltForTokenClassification(LiltPreTrainedModel):
         >>> tokenizer = AutoTokenizer.from_pretrained("SCUT-DLVCLab/lilt-roberta-en-base")
         >>> model = AutoModelForTokenClassification.from_pretrained("SCUT-DLVCLab/lilt-roberta-en-base")
 
-        >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
+        >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
         >>> example = dataset[0]
         >>> words = example["tokens"]
         >>> boxes = example["bboxes"]

@@ -1025,7 +1025,7 @@ class LiltForQuestionAnswering(LiltPreTrainedModel):
         >>> tokenizer = AutoTokenizer.from_pretrained("SCUT-DLVCLab/lilt-roberta-en-base")
         >>> model = AutoModelForQuestionAnswering.from_pretrained("SCUT-DLVCLab/lilt-roberta-en-base")
 
-        >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
+        >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
         >>> example = dataset[0]
         >>> words = example["tokens"]
         >>> boxes = example["bboxes"]
@@ -2228,7 +2228,7 @@ class SpeechT5ForSpeechToText(SpeechT5PreTrainedModel, GenerationMixin):
         >>> from datasets import load_dataset
 
         >>> dataset = load_dataset(
-        ...     "hf-internal-testing/librispeech_asr_demo", "clean", split="validation"
+        ...     "hf-internal-testing/librispeech_asr_demo", "clean", split="validation", trust_remote_code=True
         ... )  # doctest: +IGNORE_RESULT
         >>> dataset = dataset.sort("id")
         >>> sampling_rate = dataset.features["audio"].sampling_rate

@@ -2909,7 +2909,7 @@ class SpeechT5ForSpeechToSpeech(SpeechT5PreTrainedModel):
         >>> import torch
 
         >>> dataset = load_dataset(
-        ...     "hf-internal-testing/librispeech_asr_demo", "clean", split="validation"
+        ...     "hf-internal-testing/librispeech_asr_demo", "clean", split="validation", trust_remote_code=True
         ... )  # doctest: +IGNORE_RESULT
         >>> dataset = dataset.sort("id")
         >>> sampling_rate = dataset.features["audio"].sampling_rate
@@ -1604,7 +1604,7 @@ class UdopModel(UdopPreTrainedModel):
 
         >>> # load an example image, along with the words and coordinates
         >>> # which were extracted using an OCR engine
-        >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
+        >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
         >>> example = dataset[0]
         >>> image = example["image"]
         >>> words = example["tokens"]

@@ -1813,7 +1813,7 @@ class UdopForConditionalGeneration(UdopPreTrainedModel, GenerationMixin):
 
         >>> # load an example image, along with the words and coordinates
         >>> # which were extracted using an OCR engine
-        >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
+        >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
         >>> example = dataset[0]
         >>> image = example["image"]
         >>> words = example["tokens"]

@@ -2025,7 +2025,7 @@ class UdopEncoderModel(UdopPreTrainedModel):
 
         >>> # load an example image, along with the words and coordinates
         >>> # which were extracted using an OCR engine
-        >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
+        >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
         >>> example = dataset[0]
         >>> image = example["image"]
         >>> words = example["tokens"]
@@ -590,7 +590,7 @@ class Wav2Vec2CTCTokenizer(PreTrainedTokenizer):
         >>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")
 
         >>> # load first sample of English common_voice
-        >>> dataset = load_dataset("mozilla-foundation/common_voice_11_0", "en", split="train", streaming=True)
+        >>> dataset = load_dataset("mozilla-foundation/common_voice_11_0", "en", split="train", streaming=True, trust_remote_code=True)
         >>> dataset = dataset.cast_column("audio", datasets.Audio(sampling_rate=16_000))
         >>> dataset_iter = iter(dataset)
         >>> sample = next(dataset_iter)

@@ -546,7 +546,7 @@ class Wav2Vec2ProcessorWithLM(ProcessorMixin):
         >>> processor = AutoProcessor.from_pretrained("patrickvonplaten/wav2vec2-base-100h-with-lm")
 
         >>> # load first sample of English common_voice
-        >>> dataset = load_dataset("mozilla-foundation/common_voice_11_0", "en", split="train", streaming=True)
+        >>> dataset = load_dataset("mozilla-foundation/common_voice_11_0", "en", split="train", streaming=True, trust_remote_code=True)
         >>> dataset = dataset.cast_column("audio", datasets.Audio(sampling_rate=16_000))
         >>> dataset_iter = iter(dataset)
         >>> sample = next(dataset_iter)
@@ -1670,7 +1670,7 @@ FLAX_WHISPER_AUDIO_CLASSIFICATION_DOCSTRING = r"""
    >>> model = FlaxWhisperForAudioClassification.from_pretrained(
    ...     "sanchit-gandhi/whisper-medium-fleurs-lang-id", from_pt=True
    ... )
-    >>> ds = load_dataset("google/fleurs", "all", split="validation", streaming=True)
+    >>> ds = load_dataset("google/fleurs", "all", split="validation", streaming=True, trust_remote_code=True)
 
    >>> sample = next(iter(ds))
 
@@ -526,15 +526,12 @@ class Trainer:
            if is_liger_kernel_available():
                from liger_kernel.transformers import _apply_liger_kernel_to_instance
 
-                # Prepare kernel config - use provided config or default (empty dict for default behavior)
-                kernel_config = self.args.liger_kernel_config if self.args.liger_kernel_config is not None else {}
-
                if isinstance(model, PreTrainedModel):
-                    # Patch the model with liger kernels. Use the the specified or default kernel configurations.
-                    _apply_liger_kernel_to_instance(model=model, **kernel_config)
+                    # Patch the model with liger kernels. Use the default kernel configurations.
+                    _apply_liger_kernel_to_instance(model=model)
                elif hasattr(model, "get_base_model") and isinstance(model.get_base_model(), PreTrainedModel):
-                    # Patch the base model with liger kernels where model is a PeftModel. Use the specified or default kernel configurations.
-                    _apply_liger_kernel_to_instance(model=model.get_base_model(), **kernel_config)
+                    # Patch the base model with liger kernels where model is a PeftModel. Use the default kernel configurations.
+                    _apply_liger_kernel_to_instance(model=model.get_base_model())
                else:
                    logger.warning(
                        "The model is not an instance of PreTrainedModel. No liger kernels will be applied."
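The branch above only decides *which* object gets patched; the patching itself is delegated to `liger_kernel`. A minimal sketch of the same call outside the Trainer (assumes `liger-kernel` is installed and the checkpoint is one of the supported architectures; the model name here is just an example):

```python
from liger_kernel.transformers import _apply_liger_kernel_to_instance
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B")  # any supported causal LM checkpoint

# Patches supported modules (RMSNorm, RoPE, SwiGLU, fused cross-entropy, ...) in place
# with the default kernel configuration, mirroring what Trainer does when use_liger_kernel=True.
_apply_liger_kernel_to_instance(model=model)
```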
@@ -793,11 +793,6 @@ class TrainingArguments:
            It can effectively increase multi-GPU training throughput by ~20% and reduces memory usage by ~60%, works out of the box with
            flash attention, PyTorch FSDP, and Microsoft DeepSpeed. Currently, it supports llama, mistral, mixtral and gemma models.
 
-        liger_kernel_config (`Optional[dict]`, *optional*):
-            Configuration to be used for Liger Kernel. When use_liger_kernel=True, this dict is passed as keyword arguments to the
-            `_apply_liger_kernel_to_instance` function, which specifies which kernels to apply. Available options vary by model but typically
-            include: 'rope', 'swiglu', 'cross_entropy', 'fused_linear_cross_entropy', 'rms_norm', etc. If `None`, use the default kernel configurations.
-
        average_tokens_across_devices (`bool`, *optional*, defaults to `False`):
            Whether or not to average tokens across devices. If enabled, will use all_reduce to synchronize
            num_tokens_in_batch for precise loss calculation. Reference:

@@ -1530,19 +1525,6 @@ class TrainingArguments:
        metadata={"help": "Whether or not to enable the Liger Kernel for model training."},
    )
 
-    liger_kernel_config: Optional[dict[str, bool]] = field(
-        default=None,
-        metadata={
-            "help": (
-                "Configuration to be used for Liger Kernel. When use_liger_kernel=True, "
-                "this dict is passed as keyword arguments to the `_apply_liger_kernel_to_instance` function, "
-                "which specifies which kernels to apply. Available options vary by model "
-                "but typically include: 'rope', 'swiglu', 'cross_entropy', 'fused_linear_cross_entropy', "
-                "'rms_norm', etc. If None, use the default kernel configurations."
-            )
-        },
-    )
-
    eval_use_gather_object: Optional[bool] = field(
        default=False,
        metadata={
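The net effect of these two hunks is that only the boolean switch survives on this side of the compare; per-kernel selection via `liger_kernel_config` exists only on the other side. A sketch of how the remaining option is used (the output path is a placeholder, and the commented-out line shows the field that is being removed):

```python
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="out",       # placeholder path
    use_liger_kernel=True,  # kept on both sides: turn Liger kernel patching on
    # liger_kernel_config={"rope": True, "rms_norm": True},  # only valid where the field above still exists
)
```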
@@ -423,7 +423,7 @@ PT_SPEECH_BASE_MODEL_SAMPLE = r"""
    >>> import torch
    >>> from datasets import load_dataset
 
-    >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
+    >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation", trust_remote_code=True)
    >>> dataset = dataset.sort("id")
    >>> sampling_rate = dataset.features["audio"].sampling_rate
 

@@ -449,7 +449,7 @@ PT_SPEECH_CTC_SAMPLE = r"""
    >>> from datasets import load_dataset
    >>> import torch
 
-    >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
+    >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation", trust_remote_code=True)
    >>> dataset = dataset.sort("id")
    >>> sampling_rate = dataset.features["audio"].sampling_rate
 

@@ -484,7 +484,7 @@ PT_SPEECH_SEQ_CLASS_SAMPLE = r"""
    >>> from datasets import load_dataset
    >>> import torch
 
-    >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
+    >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation", trust_remote_code=True)
    >>> dataset = dataset.sort("id")
    >>> sampling_rate = dataset.features["audio"].sampling_rate
 

@@ -520,7 +520,7 @@ PT_SPEECH_FRAME_CLASS_SAMPLE = r"""
    >>> from datasets import load_dataset
    >>> import torch
 
-    >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
+    >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation", trust_remote_code=True)
    >>> dataset = dataset.sort("id")
    >>> sampling_rate = dataset.features["audio"].sampling_rate
 

@@ -549,7 +549,7 @@ PT_SPEECH_XVECTOR_SAMPLE = r"""
    >>> from datasets import load_dataset
    >>> import torch
 
-    >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
+    >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation", trust_remote_code=True)
    >>> dataset = dataset.sort("id")
    >>> sampling_rate = dataset.features["audio"].sampling_rate
 

@@ -584,7 +584,7 @@ PT_VISION_BASE_MODEL_SAMPLE = r"""
    >>> import torch
    >>> from datasets import load_dataset
 
-    >>> dataset = load_dataset("huggingface/cats-image")
+    >>> dataset = load_dataset("huggingface/cats-image", trust_remote_code=True)
    >>> image = dataset["test"]["image"][0]
 
    >>> image_processor = AutoImageProcessor.from_pretrained("{checkpoint}")

@@ -609,7 +609,7 @@ PT_VISION_SEQ_CLASS_SAMPLE = r"""
    >>> import torch
    >>> from datasets import load_dataset
 
-    >>> dataset = load_dataset("huggingface/cats-image")
+    >>> dataset = load_dataset("huggingface/cats-image", trust_remote_code=True)
    >>> image = dataset["test"]["image"][0]
 
    >>> image_processor = AutoImageProcessor.from_pretrained("{checkpoint}")

@@ -1194,7 +1194,7 @@ TF_SPEECH_BASE_MODEL_SAMPLE = r"""
    >>> from transformers import AutoProcessor, {model_class}
    >>> from datasets import load_dataset
 
-    >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
+    >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation", trust_remote_code=True)
    >>> dataset = dataset.sort("id")
    >>> sampling_rate = dataset.features["audio"].sampling_rate
 

@@ -1219,7 +1219,7 @@ TF_SPEECH_CTC_SAMPLE = r"""
    >>> from datasets import load_dataset
    >>> import tensorflow as tf
 
-    >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
+    >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation", trust_remote_code=True)
    >>> dataset = dataset.sort("id")
    >>> sampling_rate = dataset.features["audio"].sampling_rate
 

@@ -1254,7 +1254,7 @@ TF_VISION_BASE_MODEL_SAMPLE = r"""
    >>> from transformers import AutoImageProcessor, {model_class}
    >>> from datasets import load_dataset
 
-    >>> dataset = load_dataset("huggingface/cats-image")
+    >>> dataset = load_dataset("huggingface/cats-image", trust_remote_code=True)
    >>> image = dataset["test"]["image"][0]
 
    >>> image_processor = AutoImageProcessor.from_pretrained("{checkpoint}")

@@ -1277,7 +1277,7 @@ TF_VISION_SEQ_CLASS_SAMPLE = r"""
    >>> import tensorflow as tf
    >>> from datasets import load_dataset
 
-    >>> dataset = load_dataset("huggingface/cats-image"))
+    >>> dataset = load_dataset("huggingface/cats-image", trust_remote_code=True)
    >>> image = dataset["test"]["image"][0]
 
    >>> image_processor = AutoImageProcessor.from_pretrained("{checkpoint}")
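These `*_SAMPLE` strings are docstring templates rather than literal examples; placeholders such as `{model_class}` and `{checkpoint}` are filled in when the docstring is attached to a concrete model class. A rough sketch of that substitution step (plain `str.format`, standing in for the library's actual docstring-decorator machinery):

```python
PT_VISION_BASE_MODEL_SAMPLE = """
    >>> image_processor = AutoImageProcessor.from_pretrained("{checkpoint}")
    >>> model = {model_class}.from_pretrained("{checkpoint}")
"""

# Fill the template the way a docstring decorator would before attaching it to a class.
print(PT_VISION_BASE_MODEL_SAMPLE.format(model_class="ViTModel", checkpoint="google/vit-base-patch16-224-in21k"))
```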
@@ -269,6 +269,7 @@ def make_task_cmds():
        "img_clas": f"""
        {scripts_dir}/image-classification/run_image_classification.py
        --dataset_name hf-internal-testing/cats_vs_dogs_sample
+        --trust_remote_code
        --remove_unused_columns False
        --max_steps 10
        --image_processor_name {DS_TESTS_DIRECTORY}/vit_feature_extractor.json
@@ -27,6 +27,8 @@ if is_torch_available():
    import torch
 
 if is_vision_available():
+    from PIL import Image
+
    from transformers import BeitImageProcessor
 
 if is_torchvision_available():
@@ -96,14 +98,23 @@ class BeitImageProcessingTester:
 
 
 def prepare_semantic_single_inputs():
-    ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
-    example = ds[0]
-    return example["image"], example["map"]
+    dataset = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
+
+    image = Image.open(dataset[0]["file"])
+    map = Image.open(dataset[1]["file"])
+
+    return image, map
 
 
 def prepare_semantic_batch_inputs():
-    ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
-    return list(ds["image"][:2]), list(ds["map"][:2])
+    ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
+
+    image1 = Image.open(ds[0]["file"])
+    map1 = Image.open(ds[1]["file"])
+    image2 = Image.open(ds[2]["file"])
+    map2 = Image.open(ds[3]["file"])
+
+    return [image1, image2], [map1, map2]
 
 
 @require_torch
@@ -504,7 +504,7 @@ class BeitModelIntegrationTest(unittest.TestCase):
 
        image_processor = BeitImageProcessor(do_resize=True, size=640, do_center_crop=False)
 
-        ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
+        ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
        image = Image.open(ds[0]["file"])
        inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
 

@@ -547,7 +547,7 @@ class BeitModelIntegrationTest(unittest.TestCase):
 
        image_processor = BeitImageProcessor(do_resize=True, size=640, do_center_crop=False)
 
-        ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
+        ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
        image = Image.open(ds[0]["file"])
        inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
 
@@ -669,7 +669,7 @@ class Data2VecAudioModelIntegrationTest(unittest.TestCase):
        return [x["array"] for x in speech_samples]
 
    def _load_superb(self, task, num_samples):
-        ds = load_dataset("anton-l/superb_dummy", task, split="test")
+        ds = load_dataset("anton-l/superb_dummy", task, split="test", trust_remote_code=True)
 
        return ds[:num_samples]
 
@@ -29,6 +29,8 @@ if is_torch_available():
    import torch
 
 if is_vision_available():
+    from PIL import Image
+
    from transformers import DPTImageProcessor
 
 if is_torchvision_available():
@@ -92,15 +94,24 @@ class DPTImageProcessingTester:
 
 # Copied from transformers.tests.models.beit.test_image_processing_beit.prepare_semantic_single_inputs
 def prepare_semantic_single_inputs():
-    ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
-    example = ds[0]
-    return example["image"], example["map"]
+    dataset = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
+
+    image = Image.open(dataset[0]["file"])
+    map = Image.open(dataset[1]["file"])
+
+    return image, map
 
 
 # Copied from transformers.tests.models.beit.test_image_processing_beit.prepare_semantic_batch_inputs
 def prepare_semantic_batch_inputs():
-    ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
-    return list(ds["image"][:2]), list(ds["map"][:2])
+    ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
+
+    image1 = Image.open(ds[0]["file"])
+    map1 = Image.open(ds[1]["file"])
+    image2 = Image.open(ds[2]["file"])
+    map2 = Image.open(ds[3]["file"])
+
+    return [image1, image2], [map1, map2]
 
 
 @require_torch
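The test-fixture hunks in this section all flip between the same two access patterns: one side indexes decoded image features directly, the other keeps file paths in the fixture and opens them with PIL. A hedged, side-by-side sketch (dataset name as used in the tests; which branch actually works depends on how the fixture repository is laid out and on the installed `datasets` version):

```python
from datasets import load_dataset
from PIL import Image

# Layout A: the fixture exposes decoded "image" / "map" features.
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
image, seg_map = ds[0]["image"], ds[0]["map"]

# Layout B: the script-backed fixture only stores file paths, so the files are opened manually.
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
image, seg_map = Image.open(ds[0]["file"]), Image.open(ds[1]["file"])
```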
@ -767,7 +767,7 @@ class HubertModelIntegrationTest(unittest.TestCase):
|
|||||||
def _load_superb(self, task, num_samples):
|
def _load_superb(self, task, num_samples):
|
||||||
from datasets import load_dataset
|
from datasets import load_dataset
|
||||||
|
|
||||||
ds = load_dataset("anton-l/superb_dummy", task, split="test")
|
ds = load_dataset("anton-l/superb_dummy", task, split="test", trust_remote_code=True)
|
||||||
|
|
||||||
return ds[:num_samples]
|
return ds[:num_samples]
|
||||||
|
|
||||||
|
|||||||
@ -111,13 +111,13 @@ class LayoutLMv2ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase)
|
|||||||
def test_layoutlmv2_integration_test(self):
|
def test_layoutlmv2_integration_test(self):
|
||||||
from datasets import load_dataset
|
from datasets import load_dataset
|
||||||
|
|
||||||
ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test")
|
ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test", trust_remote_code=True)
|
||||||
|
|
||||||
for image_processing_class in self.image_processor_list:
|
for image_processing_class in self.image_processor_list:
|
||||||
# with apply_OCR = True
|
# with apply_OCR = True
|
||||||
image_processing = image_processing_class()
|
image_processing = image_processing_class()
|
||||||
|
|
||||||
image = ds[0]["image"]
|
image = Image.open(ds[0]["file"]).convert("RGB")
|
||||||
|
|
||||||
encoding = image_processing(image, return_tensors="pt")
|
encoding = image_processing(image, return_tensors="pt")
|
||||||
|
|
||||||
|
|||||||
@ -156,7 +156,7 @@ class LayoutLMv2ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
from datasets import load_dataset
|
from datasets import load_dataset
|
||||||
|
|
||||||
# set up
|
# set up
|
||||||
datasets = load_dataset("nielsr/funsd")
|
datasets = load_dataset("nielsr/funsd", trust_remote_code=True)
|
||||||
processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased", revision="no_ocr")
|
processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased", revision="no_ocr")
|
||||||
|
|
||||||
def preprocess_data(examples):
|
def preprocess_data(examples):
|
||||||
@ -192,8 +192,12 @@ class LayoutLMv2ProcessorIntegrationTests(unittest.TestCase):
|
|||||||
# we verify our implementation on 2 document images from the DocVQA dataset
|
# we verify our implementation on 2 document images from the DocVQA dataset
|
||||||
from datasets import load_dataset
|
from datasets import load_dataset
|
||||||
|
|
||||||
ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test")
|
ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test", trust_remote_code=True)
|
||||||
return ds[0]["image"], ds[1]["image"]
|
|
||||||
|
image_1 = Image.open(ds[0]["file"]).convert("RGB")
|
||||||
|
image_2 = Image.open(ds[1]["file"]).convert("RGB")
|
||||||
|
|
||||||
|
return image_1, image_2
|
||||||
|
|
||||||
@cached_property
|
@cached_property
|
||||||
def get_tokenizers(self):
|
def get_tokenizers(self):
|
||||||
|
|||||||
@ -22,6 +22,8 @@ from ...test_image_processing_common import ImageProcessingTestMixin, prepare_im
|
|||||||
|
|
||||||
|
|
||||||
if is_pytesseract_available():
|
if is_pytesseract_available():
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
from transformers import LayoutLMv3ImageProcessor
|
from transformers import LayoutLMv3ImageProcessor
|
||||||
|
|
||||||
if is_torchvision_available():
|
if is_torchvision_available():
|
||||||
@ -104,13 +106,13 @@ class LayoutLMv3ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase)
|
|||||||
def test_LayoutLMv3_integration_test(self):
|
def test_LayoutLMv3_integration_test(self):
|
||||||
from datasets import load_dataset
|
from datasets import load_dataset
|
||||||
|
|
||||||
ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test")
|
ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test", trust_remote_code=True)
|
||||||
|
|
||||||
# with apply_OCR = True
|
# with apply_OCR = True
|
||||||
for image_processing_class in self.image_processor_list:
|
for image_processing_class in self.image_processor_list:
|
||||||
image_processor = image_processing_class()
|
image_processor = image_processing_class()
|
||||||
|
|
||||||
image = ds[0]["image"].convert("RGB")
|
image = Image.open(ds[0]["file"]).convert("RGB")
|
||||||
|
|
||||||
encoding = image_processor(image, return_tensors="pt")
|
encoding = image_processor(image, return_tensors="pt")
|
||||||
|
|
||||||
|
|||||||
@ -28,6 +28,8 @@ from ...test_processing_common import ProcessorTesterMixin
|
|||||||
|
|
||||||
|
|
||||||
if is_pytesseract_available():
|
if is_pytesseract_available():
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
from transformers import LayoutLMv3ImageProcessor
|
from transformers import LayoutLMv3ImageProcessor
|
||||||
|
|
||||||
|
|
||||||
@ -170,8 +172,12 @@ class LayoutLMv3ProcessorIntegrationTests(unittest.TestCase):
|
|||||||
# we verify our implementation on 2 document images from the DocVQA dataset
|
# we verify our implementation on 2 document images from the DocVQA dataset
|
||||||
from datasets import load_dataset
|
from datasets import load_dataset
|
||||||
|
|
||||||
ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test")
|
ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test", trust_remote_code=True)
|
||||||
return ds[0]["image"], ds[1]["image"]
|
|
||||||
|
image_1 = Image.open(ds[0]["file"]).convert("RGB")
|
||||||
|
image_2 = Image.open(ds[1]["file"]).convert("RGB")
|
||||||
|
|
||||||
|
return image_1, image_2
|
||||||
|
|
||||||
@cached_property
|
@cached_property
|
||||||
def get_tokenizers(self):
|
def get_tokenizers(self):
|
||||||
|
|||||||
@ -162,7 +162,7 @@ class LayoutXLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
from datasets import load_dataset
|
from datasets import load_dataset
|
||||||
|
|
||||||
# set up
|
# set up
|
||||||
datasets = load_dataset("nielsr/funsd")
|
datasets = load_dataset("nielsr/funsd", trust_remote_code=True)
|
||||||
processor = LayoutXLMProcessor.from_pretrained("microsoft/layoutxlm-base", apply_ocr=False)
|
processor = LayoutXLMProcessor.from_pretrained("microsoft/layoutxlm-base", apply_ocr=False)
|
||||||
|
|
||||||
def preprocess_data(examples):
|
def preprocess_data(examples):
|
||||||
@ -200,8 +200,12 @@ class LayoutXLMProcessorIntegrationTests(unittest.TestCase):
|
|||||||
# we verify our implementation on 2 document images from the DocVQA dataset
|
# we verify our implementation on 2 document images from the DocVQA dataset
|
||||||
from datasets import load_dataset
|
from datasets import load_dataset
|
||||||
|
|
||||||
ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test")
|
ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test", trust_remote_code=True)
|
||||||
return ds[0]["image"], ds[1]["image"]
|
|
||||||
|
image_1 = Image.open(ds[0]["file"]).convert("RGB")
|
||||||
|
image_2 = Image.open(ds[1]["file"]).convert("RGB")
|
||||||
|
|
||||||
|
return image_1, image_2
|
||||||
|
|
||||||
@cached_property
|
@cached_property
|
||||||
def get_tokenizers(self):
|
def get_tokenizers(self):
|
||||||
|
|||||||
@ -27,6 +27,8 @@ if is_torch_available():
|
|||||||
import torch
|
import torch
|
||||||
|
|
||||||
if is_vision_available():
|
if is_vision_available():
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
from transformers import MobileViTImageProcessor
|
from transformers import MobileViTImageProcessor
|
||||||
|
|
||||||
|
|
||||||
@ -84,14 +86,23 @@ class MobileViTImageProcessingTester:
|
|||||||
|
|
||||||
|
|
||||||
def prepare_semantic_single_inputs():
|
def prepare_semantic_single_inputs():
|
||||||
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
|
dataset = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
|
||||||
example = ds[0]
|
|
||||||
return example["image"], example["map"]
|
image = Image.open(dataset[0]["file"])
|
||||||
|
map = Image.open(dataset[1]["file"])
|
||||||
|
|
||||||
|
return image, map
|
||||||
|
|
||||||
|
|
||||||
def prepare_semantic_batch_inputs():
|
def prepare_semantic_batch_inputs():
|
||||||
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
|
dataset = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
|
||||||
return list(ds["image"][:2]), list(ds["map"][:2])
|
|
||||||
|
image1 = Image.open(dataset[0]["file"])
|
||||||
|
map1 = Image.open(dataset[1]["file"])
|
||||||
|
image2 = Image.open(dataset[2]["file"])
|
||||||
|
map2 = Image.open(dataset[3]["file"])
|
||||||
|
|
||||||
|
return [image1, image2], [map1, map2]
|
||||||
|
|
||||||
|
|
||||||
@require_torch
|
@require_torch
|
||||||
|
|||||||
@ -86,12 +86,8 @@ class NougatImageProcessingTester:
|
|||||||
return self.num_channels, self.size["height"], self.size["width"]
|
return self.num_channels, self.size["height"], self.size["width"]
|
||||||
|
|
||||||
def prepare_dummy_image(self):
|
def prepare_dummy_image(self):
|
||||||
revision = "ec57bf8c8b1653a209c13f6e9ee66b12df0fc2db"
|
|
||||||
filepath = hf_hub_download(
|
filepath = hf_hub_download(
|
||||||
repo_id="hf-internal-testing/fixtures_docvqa",
|
repo_id="hf-internal-testing/fixtures_docvqa", filename="nougat_pdf.png", repo_type="dataset"
|
||||||
filename="nougat_pdf.png",
|
|
||||||
repo_type="dataset",
|
|
||||||
revision=revision,
|
|
||||||
)
|
)
|
||||||
image = Image.open(filepath).convert("RGB")
|
image = Image.open(filepath).convert("RGB")
|
||||||
return image
|
return image
|
||||||
@ -183,12 +179,8 @@ class NougatImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
|||||||
self.assertEqual((3, 100, 200), aligned_image.shape)
|
self.assertEqual((3, 100, 200), aligned_image.shape)
|
||||||
|
|
||||||
def prepare_dummy_np_image(self):
|
def prepare_dummy_np_image(self):
|
||||||
revision = "ec57bf8c8b1653a209c13f6e9ee66b12df0fc2db"
|
|
||||||
filepath = hf_hub_download(
|
filepath = hf_hub_download(
|
||||||
repo_id="hf-internal-testing/fixtures_docvqa",
|
repo_id="hf-internal-testing/fixtures_docvqa", filename="nougat_pdf.png", repo_type="dataset"
|
||||||
filename="nougat_pdf.png",
|
|
||||||
repo_type="dataset",
|
|
||||||
revision=revision,
|
|
||||||
)
|
)
|
||||||
image = Image.open(filepath).convert("RGB")
|
image = Image.open(filepath).convert("RGB")
|
||||||
return np.array(image)
|
return np.array(image)
|
||||||
|
|||||||
@ -842,8 +842,11 @@ def prepare_img():
|
|||||||
|
|
||||||
# Helper functions for optical flow integration test
|
# Helper functions for optical flow integration test
|
||||||
def prepare_optical_flow_images():
|
def prepare_optical_flow_images():
|
||||||
ds = load_dataset("hf-internal-testing/fixtures_sintel", split="test")
|
dataset = load_dataset("hf-internal-testing/fixtures_sintel", split="test", trust_remote_code=True)
|
||||||
return list(ds["image"][:2])
|
image1 = Image.open(dataset[0]["file"]).convert("RGB")
|
||||||
|
image2 = Image.open(dataset[0]["file"]).convert("RGB")
|
||||||
|
|
||||||
|
return image1, image2
|
||||||
|
|
||||||
|
|
||||||
def normalize(img):
|
def normalize(img):
|
||||||
|
|||||||
@ -27,6 +27,8 @@ if is_torch_available():
|
|||||||
import torch
|
import torch
|
||||||
|
|
||||||
if is_vision_available():
|
if is_vision_available():
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
from transformers import SegformerImageProcessor
|
from transformers import SegformerImageProcessor
|
||||||
|
|
||||||
|
|
||||||
@ -84,14 +86,23 @@ class SegformerImageProcessingTester:
|
|||||||
|
|
||||||
|
|
||||||
def prepare_semantic_single_inputs():
|
def prepare_semantic_single_inputs():
|
||||||
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
|
dataset = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
|
||||||
example = ds[0]
|
|
||||||
return example["image"], example["map"]
|
image = Image.open(dataset[0]["file"])
|
||||||
|
map = Image.open(dataset[1]["file"])
|
||||||
|
|
||||||
|
return image, map
|
||||||
|
|
||||||
|
|
||||||
def prepare_semantic_batch_inputs():
|
def prepare_semantic_batch_inputs():
|
||||||
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
|
dataset = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
|
||||||
return list(ds["image"][:2]), list(ds["map"][:2])
|
|
||||||
|
image1 = Image.open(dataset[0]["file"])
|
||||||
|
map1 = Image.open(dataset[1]["file"])
|
||||||
|
image2 = Image.open(dataset[2]["file"])
|
||||||
|
map2 = Image.open(dataset[3]["file"])
|
||||||
|
|
||||||
|
return [image1, image2], [map1, map2]
|
||||||
|
|
||||||
|
|
||||||
@require_torch
|
@require_torch
|
||||||
|
|||||||
@ -184,7 +184,7 @@ class UdopProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
from datasets import load_dataset
|
from datasets import load_dataset
|
||||||
|
|
||||||
# set up
|
# set up
|
||||||
datasets = load_dataset("nielsr/funsd")
|
datasets = load_dataset("nielsr/funsd", trust_remote_code=True)
|
||||||
processor = UdopProcessor.from_pretrained("microsoft/udop-large", apply_ocr=False)
|
processor = UdopProcessor.from_pretrained("microsoft/udop-large", apply_ocr=False)
|
||||||
|
|
||||||
def preprocess_data(examples):
|
def preprocess_data(examples):
|
||||||
@ -222,8 +222,12 @@ class UdopProcessorIntegrationTests(unittest.TestCase):
|
|||||||
# we verify our implementation on 2 document images from the DocVQA dataset
|
# we verify our implementation on 2 document images from the DocVQA dataset
|
||||||
from datasets import load_dataset
|
from datasets import load_dataset
|
||||||
|
|
||||||
ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test")
|
ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test", trust_remote_code=True)
|
||||||
return ds[0]["image"], ds[1]["image"]
|
|
||||||
|
image_1 = Image.open(ds[0]["file"]).convert("RGB")
|
||||||
|
image_2 = Image.open(ds[1]["file"]).convert("RGB")
|
||||||
|
|
||||||
|
return image_1, image_2
|
||||||
|
|
||||||
@cached_property
|
@cached_property
|
||||||
def get_tokenizers(self):
|
def get_tokenizers(self):
|
||||||
|
|||||||
@ -566,7 +566,7 @@ class UniSpeechModelIntegrationTest(unittest.TestCase):
|
|||||||
return [x["array"] for x in speech_samples]
|
return [x["array"] for x in speech_samples]
|
||||||
|
|
||||||
def _load_superb(self, task, num_samples):
|
def _load_superb(self, task, num_samples):
|
||||||
ds = load_dataset("anton-l/superb_dummy", task, split="test")
|
ds = load_dataset("anton-l/superb_dummy", task, split="test", trust_remote_code=True)
|
||||||
|
|
||||||
return ds[:num_samples]
|
return ds[:num_samples]
|
||||||
|
|
||||||
|
|||||||
@ -820,7 +820,7 @@ class UniSpeechSatModelIntegrationTest(unittest.TestCase):
|
|||||||
return [x["array"] for x in speech_samples]
|
return [x["array"] for x in speech_samples]
|
||||||
|
|
||||||
def _load_superb(self, task, num_samples):
|
def _load_superb(self, task, num_samples):
|
||||||
ds = load_dataset("anton-l/superb_dummy", task, split="test")
|
ds = load_dataset("anton-l/superb_dummy", task, split="test", trust_remote_code=True)
|
||||||
|
|
||||||
return ds[:num_samples]
|
return ds[:num_samples]
|
||||||
|
|
||||||
|
|||||||
@ -637,9 +637,9 @@ class ViltModelIntegrationTest(unittest.TestCase):
|
|||||||
|
|
||||||
processor = self.default_processor
|
processor = self.default_processor
|
||||||
|
|
||||||
dataset = load_dataset("hf-internal-testing/fixtures_nlvr2", split="train")
|
dataset = load_dataset("hf-internal-testing/fixtures_nlvr2", split="test", trust_remote_code=True)
|
||||||
image1 = dataset[0]["image"]
|
image1 = Image.open(dataset[0]["file"]).convert("RGB")
|
||||||
image2 = dataset[1]["image"]
|
image2 = Image.open(dataset[1]["file"]).convert("RGB")
|
||||||
|
|
||||||
text = (
|
text = (
|
||||||
"The left image contains twice the number of dogs as the right image, and at least two dogs in total are"
|
"The left image contains twice the number of dogs as the right image, and at least two dogs in total are"
|
||||||
|
|||||||
@ -1149,8 +1149,8 @@ class TrOCRModelIntegrationTest(unittest.TestCase):
|
|||||||
def test_inference_handwritten(self):
|
def test_inference_handwritten(self):
|
||||||
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten").to(torch_device)
|
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten").to(torch_device)
|
||||||
|
|
||||||
dataset = load_dataset("hf-internal-testing/fixtures_ocr", split="train")
|
dataset = load_dataset("hf-internal-testing/fixtures_ocr", split="test", trust_remote_code=True)
|
||||||
image = dataset[0]["image"]
|
image = Image.open(dataset[0]["file"]).convert("RGB")
|
||||||
|
|
||||||
processor = self.default_processor
|
processor = self.default_processor
|
||||||
pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(torch_device)
|
pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(torch_device)
|
||||||
@ -1174,8 +1174,8 @@ class TrOCRModelIntegrationTest(unittest.TestCase):
|
|||||||
def test_inference_printed(self):
|
def test_inference_printed(self):
|
||||||
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-printed").to(torch_device)
|
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-printed").to(torch_device)
|
||||||
|
|
||||||
dataset = load_dataset("hf-internal-testing/fixtures_ocr", split="test")
|
dataset = load_dataset("hf-internal-testing/fixtures_ocr", split="test", trust_remote_code=True)
|
||||||
image = dataset[0]["image"]
|
image = Image.open(dataset[1]["file"]).convert("RGB")
|
||||||
|
|
||||||
processor = self.default_processor
|
processor = self.default_processor
|
||||||
pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(torch_device)
|
pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(torch_device)
|
||||||
|
|||||||
@@ -97,7 +97,9 @@ def _test_wav2vec2_with_lm_invalid_pool(in_queue, out_queue, timeout):
     try:
         _ = in_queue.get(timeout=timeout)

-        ds = load_dataset("mozilla-foundation/common_voice_11_0", "es", split="test", streaming=True)
+        ds = load_dataset(
+            "mozilla-foundation/common_voice_11_0", "es", split="test", streaming=True, trust_remote_code=True
+        )
         sample = next(iter(ds))

         resampled_audio = torchaudio.functional.resample(
@@ -1468,7 +1470,7 @@ class Wav2Vec2ModelIntegrationTest(unittest.TestCase):
         return [x["array"] for x in speech_samples]

     def _load_superb(self, task, num_samples):
-        ds = load_dataset("anton-l/superb_dummy", task, split="test")
+        ds = load_dataset("anton-l/superb_dummy", task, split="test", trust_remote_code=True)

         return ds[:num_samples]

@@ -1834,7 +1836,9 @@ class Wav2Vec2ModelIntegrationTest(unittest.TestCase):
     @require_pyctcdecode
     @require_torchaudio
     def test_wav2vec2_with_lm(self):
-        ds = load_dataset("mozilla-foundation/common_voice_11_0", "es", split="test", streaming=True)
+        ds = load_dataset(
+            "mozilla-foundation/common_voice_11_0", "es", split="test", streaming=True, trust_remote_code=True
+        )
         sample = next(iter(ds))

         resampled_audio = torchaudio.functional.resample(
@@ -1858,7 +1862,9 @@ class Wav2Vec2ModelIntegrationTest(unittest.TestCase):
     @require_pyctcdecode
     @require_torchaudio
     def test_wav2vec2_with_lm_pool(self):
-        ds = load_dataset("mozilla-foundation/common_voice_11_0", "es", split="test", streaming=True)
+        ds = load_dataset(
+            "mozilla-foundation/common_voice_11_0", "es", split="test", streaming=True, trust_remote_code=True
+        )
         sample = next(iter(ds))

         resampled_audio = torchaudio.functional.resample(
@@ -1957,7 +1963,9 @@ class Wav2Vec2ModelIntegrationTest(unittest.TestCase):
         LANG_MAP = {"it": "ita", "es": "spa", "fr": "fra", "en": "eng"}

         def run_model(lang):
-            ds = load_dataset("mozilla-foundation/common_voice_11_0", lang, split="test", streaming=True)
+            ds = load_dataset(
+                "mozilla-foundation/common_voice_11_0", lang, split="test", streaming=True, trust_remote_code=True
+            )
             sample = next(iter(ds))

             wav2vec2_lang = LANG_MAP[lang]
@@ -463,7 +463,9 @@ class Wav2Vec2ProcessorWithLMTest(unittest.TestCase):
     def test_word_time_stamp_integration(self):
         import torch

-        ds = load_dataset("mozilla-foundation/common_voice_11_0", "en", split="train", streaming=True)
+        ds = load_dataset(
+            "mozilla-foundation/common_voice_11_0", "en", split="train", streaming=True, trust_remote_code=True
+        )
         ds = ds.cast_column("audio", datasets.Audio(sampling_rate=16_000))
         ds_iter = iter(ds)
         sample = next(ds_iter)
@@ -473,7 +473,7 @@ class WavLMModelIntegrationTest(unittest.TestCase):
         return [x["array"] for x in speech_samples]

     def _load_superb(self, task, num_samples):
-        ds = load_dataset("anton-l/superb_dummy", task, split="test")
+        ds = load_dataset("anton-l/superb_dummy", task, split="test", trust_remote_code=True)

         return ds[:num_samples]

@@ -1645,7 +1645,9 @@ class WhisperModelIntegrationTests(unittest.TestCase):
         model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v3")
         model.to(torch_device)

-        ds = load_dataset("facebook/multilingual_librispeech", "german", split="test", streaming=True)
+        ds = load_dataset(
+            "facebook/multilingual_librispeech", "german", split="test", streaming=True, trust_remote_code=True
+        )
         ds = ds.cast_column("audio", datasets.Audio(sampling_rate=16_000))

         input_speech = next(iter(ds))["audio"]["array"]
@@ -1712,10 +1714,11 @@ class WhisperModelIntegrationTests(unittest.TestCase):

         token = os.getenv("HF_HUB_READ_TOKEN", True)
         ds = load_dataset(
-            "hf-internal-testing/fixtures_common_voice",
+            "mozilla-foundation/common_voice_6_1",
             "ja",
             split="test",
             streaming=True,
+            trust_remote_code=True,
             token=token,
         )
         ds = ds.cast_column("audio", datasets.Audio(sampling_rate=16_000))
@@ -179,7 +179,7 @@ class AudioClassificationPipelineTests(unittest.TestCase):
         model = "superb/wav2vec2-base-superb-ks"

         audio_classifier = pipeline("audio-classification", model=model)
-        dataset = datasets.load_dataset("anton-l/superb_dummy", "ks", split="test")
+        dataset = datasets.load_dataset("anton-l/superb_dummy", "ks", split="test", trust_remote_code=True)

         audio = np.array(dataset[3]["speech"], dtype=np.float32)
         output = audio_classifier(audio, top_k=4)
@@ -265,7 +265,9 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
     @require_torch
     @require_pyctcdecode
     def test_large_model_pt_with_lm(self):
-        filename = hf_hub_download("Narsil/asr_dummy", filename="4.flac", repo_type="dataset")
+        dataset = load_dataset("Narsil/asr_dummy", streaming=True, trust_remote_code=True)
+        third_item = next(iter(dataset["test"].skip(3)))
+        filename = third_item["file"]

         speech_recognizer = pipeline(
             task="automatic-speech-recognition",
@@ -386,7 +388,7 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
             chunk_length_s=8,
             stride_length_s=1,
         )
-        data = load_dataset("openslr/librispeech_asr", "clean", split="test", streaming=True)
+        data = load_dataset("openslr/librispeech_asr", "clean", split="test", streaming=True, trust_remote_code=True)
         sample = next(iter(data))

         res = pipe(sample["audio"]["array"])
@@ -432,7 +434,7 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
             stride_length_s=1,
             return_language=True,
         )
-        data = load_dataset("openslr/librispeech_asr", "clean", split="test", streaming=True)
+        data = load_dataset("openslr/librispeech_asr", "clean", split="test", streaming=True, trust_remote_code=True)
         sample = next(iter(data))

         res = pipe(sample["audio"]["array"])
@@ -487,7 +489,7 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
             task="automatic-speech-recognition",
             model="openai/whisper-tiny.en",
         )
-        data = load_dataset("openslr/librispeech_asr", "clean", split="test", streaming=True)
+        data = load_dataset("openslr/librispeech_asr", "clean", split="test", streaming=True, trust_remote_code=True)
         samples = [next(iter(data)) for _ in range(8)]
         audio = np.concatenate([sample["audio"]["array"] for sample in samples])

@@ -1123,7 +1125,9 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
     @slow
     def test_speculative_decoding_whisper_non_distil(self):
         # Load data:
-        dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:1]")
+        dataset = load_dataset(
+            "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:1]", trust_remote_code=True
+        )
         sample = dataset[0]["audio"]

         # Load model:
@@ -1165,7 +1169,9 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
     @slow
     def test_speculative_decoding_whisper_distil(self):
         # Load data:
-        dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:1]")
+        dataset = load_dataset(
+            "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:1]", trust_remote_code=True
+        )
         sample = dataset[0]["audio"]

         # Load model:
@@ -601,7 +601,7 @@ class ImageSegmentationPipelineTests(unittest.TestCase):

         image_segmenter = pipeline("image-segmentation", model=model, image_processor=image_processor)

-        image = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
+        image = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
         file = image[0]["file"]
         outputs = image_segmenter(file, threshold=threshold)

@@ -655,7 +655,7 @@ class ImageSegmentationPipelineTests(unittest.TestCase):
     def test_oneformer(self):
         image_segmenter = pipeline(model="shi-labs/oneformer_ade20k_swin_tiny")

-        image = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
+        image = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
         file = image[0]["file"]
         outputs = image_segmenter(file, threshold=0.99)
         # Shortening by hashing
@@ -3799,20 +3799,8 @@ class ModelTesterMixin:
             self.skipTest(reason="Idefics currently (transformers==4.39.1) requires an image_attention_mask input")
         if config.model_type in ["sam"]:
             self.skipTest(reason="SAM requires an attention_mask input for relative positional embeddings")

         model = model_class(config)
-
-        sub_models_supporting_sdpa = [
-            module._supports_sdpa
-            for name, module in model.named_modules()
-            if isinstance(module, PreTrainedModel) and name != ""
-        ]
-        supports_sdpa_all_modules = (
-            all(sub_models_supporting_sdpa) if len(sub_models_supporting_sdpa) > 0 else model._supports_sdpa
-        )
-        if not supports_sdpa_all_modules:
-            self.skipTest(reason="This models' submodels does not support sdpa")
-
         with tempfile.TemporaryDirectory() as tmpdirname:
             model.save_pretrained(tmpdirname)
             model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, attn_implementation="sdpa")
@@ -3860,20 +3848,8 @@ class ModelTesterMixin:
                 "Cannot compile forward without an existing cache with Hybrid, as `torch._dynamo.mark_static_address` "
                 "is a forbidden call."
             )

         model = model_class(config)
-
-        sub_models_supporting_sdpa = [
-            module._supports_sdpa
-            for name, module in model.named_modules()
-            if isinstance(module, PreTrainedModel) and name != ""
-        ]
-        supports_sdpa_all_modules = (
-            all(sub_models_supporting_sdpa) if len(sub_models_supporting_sdpa) > 0 else model._supports_sdpa
-        )
-        if not supports_sdpa_all_modules:
-            self.skipTest(reason="This models' submodels does not support sdpa")
-
         with tempfile.TemporaryDirectory() as tmpdirname:
             model.save_pretrained(tmpdirname)
             model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, attn_implementation="sdpa")
@@ -1792,25 +1792,6 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
         self.assertEqual(modeling_llama.apply_rotary_pos_emb, liger_rotary_pos_emb)
         self.assertTrue(isinstance(tiny_llama.model.norm, LigerRMSNorm))

-    @require_liger_kernel
-    def test_use_liger_kernel_custom_config_patching(self):
-        # Ensure any monkey patching is cleaned up for subsequent tests
-        with patch("transformers.models.llama.modeling_llama"):
-            from liger_kernel.transformers import LigerRMSNorm
-
-            config = LlamaConfig(vocab_size=100, hidden_size=32, num_hidden_layers=3, num_attention_heads=4)
-            tiny_llama = LlamaForCausalLM(config)
-
-            args = TrainingArguments(
-                self.get_auto_remove_tmp_dir(),
-                use_liger_kernel=True,
-                liger_kernel_config={"rms_norm": False},  # Don't apply Liger's RMSNorm
-            )
-            Trainer(tiny_llama, args)
-
-            # Check that the RMSNorm kernel is not applied as specified in the config
-            self.assertFalse(isinstance(tiny_llama.model.norm, LigerRMSNorm))
-
     @require_liger_kernel
     @require_torch_accelerator
     def test_use_liger_kernel_trainer(self):
@@ -1829,29 +1810,6 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
         # Check this works
         _ = trainer.train()

-    @require_liger_kernel
-    @require_torch_accelerator
-    def test_use_liger_kernel_custom_config_trainer(self):
-        # Check that trainer still works with liger kernel applied when using a custom config
-        config = LlamaConfig(vocab_size=100, hidden_size=32, num_hidden_layers=3, num_attention_heads=4)
-        tiny_llama = LlamaForCausalLM(config)
-
-        x = torch.randint(0, 100, (128,))
-        train_dataset = RepeatDataset(x)
-
-        args = TrainingArguments(
-            self.get_auto_remove_tmp_dir(),
-            learning_rate=1e-2,
-            logging_steps=5,
-            max_steps=20,
-            use_liger_kernel=True,
-            liger_kernel_config={"rms_norm": False, "cross_entropy": True, "fused_linear_cross_entropy": False},
-        )
-        trainer = Trainer(tiny_llama, args, train_dataset=train_dataset)
-
-        # Check this works
-        _ = trainer.train()
-
     @require_lomo
     @require_torch_accelerator
     def test_lomo(self):
@@ -133,19 +133,10 @@ if __name__ == "__main__":
     # Assuming there is a topological sort on the dependency mapping: if the file being checked and its dependencies
     # are not in the diff, then there it is guaranteed to have no differences. If no models are in the diff, then this
     # script will do nothing.
-    current_branch = subprocess.check_output(["git", "branch", "--show-current"], text=True).strip()
-    if current_branch == "main":
-        console.print(
-            "[bold red]You are developing on the main branch. We cannot identify the list of changed files and will have to check all files. This may take a while.[/bold red]"
-        )
-        models_in_diff = {file_path.split("/")[-2] for file_path in args.files}
-    else:
-        models_in_diff = get_models_in_diff()
-        if not models_in_diff:
-            console.print(
-                "[bold green]No models files or model tests in the diff, skipping modular checks[/bold green]"
-            )
-            exit(0)
+    models_in_diff = get_models_in_diff()
+    if not models_in_diff:
+        console.print("[bold green]No models files or model tests in the diff, skipping modular checks[/bold green]")
+        exit(0)

     skipped_models = set()
     non_matching_files = 0
@@ -158,8 +149,7 @@ if __name__ == "__main__":
                 skipped_models.add(model_name)
                 continue
             non_matching_files += compare_files(modular_file_path, args.fix_and_overwrite)
-            if current_branch != "main":
-                models_in_diff = get_models_in_diff()  # When overwriting, the diff changes
+            models_in_diff = get_models_in_diff()  # When overwriting, the diff changes
     else:
         new_ordered_files = []
         for modular_file_path in ordered_files:
utils/get_runner_map.py (new file, 59 lines added)
@@ -0,0 +1,59 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+This script is used to get a map containing the information of runners to use in GitHub Actions workflow files.
+This is meant to be a temporary file that helps us to switch progressively from T4 to A10 runners.
+
+The data is stored in a Hub repository [hf-internal-testing/transformers_daily_ci](https://huggingface.co/datasets/hf-internal-testing/transformers_daily_ci/blob/main/runner_map.json).
+Currently, in that file, we specify the models for which we want to run the tests with T4 runners to avoid many test failures showing on the CI reports.
+We will work on the tests toward to use A10 for all CI jobs.
+"""
+
+import os
+import requests
+
+
+if __name__ == "__main__":
+
+    # T4
+    t4_runners = {
+        "single-gpu": "aws-g4dn-4xlarge-cache",
+        "multi-gpu": "aws-g4dn-12xlarge-cache",
+    }
+
+    # A10
+    a10_runners = {
+        "single-gpu": "aws-g5-4xlarge-cache",
+        "multi-gpu": "aws-g5-12xlarge-cache",
+    }
+
+    tests = os.getcwd()
+    model_tests = os.listdir(os.path.join(tests, "models"))
+    d1 = sorted(filter(os.path.isdir, os.listdir(tests)))
+    d2 = sorted(filter(os.path.isdir, [f"models/{x}" for x in model_tests]))
+    d1.remove("models")
+    d = d2 + d1
+
+    response = requests.get("https://huggingface.co/datasets/hf-internal-testing/transformers_daily_ci/resolve/main/runner_map.json")
+    # The models that we want to run with T4 runners
+    runner_map = response.json()
+
+    for key in d:
+        if key in runner_map:
+            runner_map[key] = t4_runners
+        else:
+            runner_map[key] = a10_runners
+
+    print(runner_map)
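Note (not part of the diff): the script above prints a plain Python dict keyed by test folder, with one runner group per machine type. A minimal sketch of the kind of lookup a consumer of that printed map could perform is below; the folder keys and the pick_runner helper are illustrative assumptions, not code taken from this repository or its workflow files.

# Illustrative sketch only: the keys are hypothetical test folders; the values mirror
# the T4/A10 runner groups defined in utils/get_runner_map.py above.
runner_map = {
    "models/vit": {"single-gpu": "aws-g4dn-4xlarge-cache", "multi-gpu": "aws-g4dn-12xlarge-cache"},
    "generation": {"single-gpu": "aws-g5-4xlarge-cache", "multi-gpu": "aws-g5-12xlarge-cache"},
}


def pick_runner(folder: str, machine_type: str) -> str:
    # A workflow consuming the printed map would perform an equivalent per-folder,
    # per-machine-type lookup (an assumption, not quoted from the workflow).
    return runner_map[folder][machine_type]


print(pick_runner("models/vit", "single-gpu"))  # -> aws-g4dn-4xlarge-cache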
@@ -1494,7 +1494,7 @@ if __name__ == "__main__":
         other_ci_artifacts=other_ci_artifacts,
     )

-    # send report only if there is any failure (for push CI)
-    if message.n_failures or (ci_event != "push" and not ci_event.startswith("Push CI (AMD)")):
-        message.post()
-        message.post_reply()
+    # # send report only if there is any failure (for push CI)
+    # if message.n_failures or (ci_event != "push" and not ci_event.startswith("Push CI (AMD)")):
+    #     message.post()
+    #     message.post_reply()
@@ -62,4 +62,5 @@ if __name__ == "__main__":
         start = end
         end = start + num_jobs_per_splits + (1 if idx < num_jobs % args.num_splits else 0)
         model_splits.append(d[start:end])
+    model_splits = [['models/vit', 'generation'], ['models/clip', 'models/vits']]
     print(model_splits)