Compare commits


21 Commits

Author SHA1 Message Date
6d38d27ef3 trigger for Remove script datasets in tests #38940 2025-06-20 20:12:47 +02:00
20c0f8bc77 pick 2025-06-20 20:10:35 +02:00
9b2afaf02d fix integration test 2025-06-20 19:54:27 +02:00
d188134b95 style 2025-06-20 17:43:26 +02:00
e2ed15c465 again 2025-06-20 17:42:17 +02:00
005459827e again 2025-06-20 14:44:38 +02:00
69419a4935 style 2025-06-20 14:37:26 +02:00
1fdb9f3908 again 2025-06-20 13:19:50 +02:00
3dfebf2fc0 Revert "Skip some tests for now (#38931)"
This reverts commit 31d30b72245aacfdf70249165964b53790d9c4d8.
2025-06-20 13:00:47 +02:00
e6093deb18 again 2025-06-20 13:00:25 +02:00
b7ec09c2f4 remove trust_remote_code 2025-06-20 13:00:25 +02:00
aa42987c1e Remove ALL_LAYERNORM_LAYERS (#38922)
* remove it everywhere

* Update trainer_pt_utils.py

* Update trainer_pt_utils.py

* style

* sort list in test

* CIs

* use recursion same way as before (for intermediate layer names)
2025-06-20 12:06:48 +02:00
38a9b70786 add pytorch-xpu Dockerfile (#38875)
* first commit

Signed-off-by: YAO Matrix <matrix.yao@intel.com>

* use rls pytorch

Signed-off-by: YAO Matrix <matrix.yao@intel.com>

---------

Signed-off-by: YAO Matrix <matrix.yao@intel.com>
2025-06-20 11:42:44 +02:00
9bcdd5cde9 Modernbert fixes (#38912)
* Removed deprecated argument in modernbert RotaryEmbedding

* Skip test_sdpa_can_dispatch_on_flash for modernbert

---------

Co-authored-by: ivarflakstad <69173633+ivarflakstad@users.noreply.github.com>
Co-authored-by: Yih-Dar <2521628+ydshieh@users.noreply.github.com>
2025-06-20 11:22:32 +02:00
31d30b7224 Skip some tests for now (#38931)
* try

* [test all]

---------

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2025-06-20 11:05:49 +02:00
0725cd6953 Remove deprecated classes in modeling_utils.py (#38919)
* remove deprecated classes

* style
2025-06-19 19:25:20 +02:00
797860c68c feat: add flexible Liger Kernel configuration to TrainingArguments (#38911)
* feat: add flexible Liger Kernel configuration to TrainingArguments

Add support for granular Liger Kernel configuration through a new
`liger_kernel_config` parameter in TrainingArguments. This allows users
to selectively enable/disable specific kernels (rope, swiglu, cross_entropy,
etc.) instead of the current approach that relies on the default configuration.

Features:
- Add `liger_kernel_config` dict parameter to TrainingArguments
- Support selective kernel application for all supported models
- Maintain full backward compatibility with existing `use_liger_kernel` flag

Example usage:
```python
TrainingArguments(
    use_liger_kernel=True,
    liger_kernel_config={
        "rope": True,
        "swiglu": True,
        "cross_entropy": False,
        "fused_linear_cross_entropy": True
    }
)
```

Closes #38905

* Address comments and update Liger section in Trainer docs
2025-06-19 15:54:08 +00:00
89b35be618 Allow make-fixup on main branch, albeit slowly (#38892)
* Allow make-fixup on main branch, albeit slowly

* Make the other style checks work correctly on main too

* More update

* More makefile update
2025-06-19 15:22:59 +01:00
9a02e7602d feat: Add granite architectures to auto tokenizer name mappings (#38802)
Branch: GraniteTokenizerMapping

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
2025-06-19 15:20:42 +01:00
54a02160eb Fix ReDOS in tokenizer digit substitution (#38844)
* Fix regexes vulnerable to ReDOS

* Let's just use regex

* Import regex/re correctly
2025-06-19 14:53:52 +01:00
af6120b3eb Skip sdpa tests if submodule does not support sdpa (#38907) 2025-06-19 13:11:01 +00:00
65 changed files with 430 additions and 822 deletions

View File

@ -12,8 +12,8 @@ on:
slice_id:
required: true
type: number
runner_map:
required: false
runner:
required: true
type: string
docker:
required: true
@ -45,7 +45,7 @@ jobs:
matrix:
folders: ${{ fromJson(inputs.folder_slices)[inputs.slice_id] }}
runs-on:
group: ${{ fromJson(inputs.runner_map)[matrix.folders][inputs.machine_type] }}
group: '${{ inputs.machine_type }}'
container:
image: ${{ inputs.docker }}
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/

.github/workflows/model_jobs_amd.yml vendored Normal file (128 lines)
View File

@ -0,0 +1,128 @@
name: model jobs
on:
workflow_call:
inputs:
folder_slices:
required: true
type: string
machine_type:
required: true
type: string
slice_id:
required: true
type: number
runner:
required: true
type: string
docker:
required: true
type: string
env:
HF_HOME: /mnt/cache
TRANSFORMERS_IS_CI: yes
OMP_NUM_THREADS: 8
MKL_NUM_THREADS: 8
RUN_SLOW: yes
# For gated repositories, we still need to agree to share information on the Hub repo. page in order to get access.
# This token is created under the bot `hf-transformers-bot`.
HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
TF_FORCE_GPU_ALLOW_GROWTH: true
CUDA_VISIBLE_DEVICES: 0,1
jobs:
run_models_gpu:
name: " "
strategy:
max-parallel: 1 # For now, not to parallelize. Can change later if it works well.
fail-fast: false
matrix:
folders: ${{ fromJson(inputs.folder_slices)[inputs.slice_id] }}
runs-on: ['${{ inputs.machine_type }}', self-hosted, amd-gpu, '${{ inputs.runner }}']
container:
image: ${{ inputs.docker }}
options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
- name: Echo input and matrix info
shell: bash
run: |
echo "${{ inputs.folder_slices }}"
echo "${{ matrix.folders }}"
echo "${{ toJson(fromJson(inputs.folder_slices)[inputs.slice_id]) }}"
- name: Echo folder ${{ matrix.folders }}
shell: bash
# For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
# set the artifact folder names (because the character `/` is not allowed).
run: |
echo "${{ matrix.folders }}"
matrix_folders=${{ matrix.folders }}
matrix_folders=${matrix_folders/'models/'/'models_'}
echo "$matrix_folders"
echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
- name: Update clone
working-directory: /transformers
run: git fetch && git checkout ${{ github.sha }}
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
working-directory: /transformers
run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
- name: Update / Install some packages (for Past CI)
if: ${{ contains(inputs.docker, '-past-') }}
working-directory: /transformers
run: |
python3 -m pip install -U datasets
- name: Update / Install some packages (for Past CI)
if: ${{ contains(inputs.docker, '-past-') && contains(inputs.docker, '-pytorch-') }}
working-directory: /transformers
run: |
python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
- name: ROCM-SMI
run: |
rocm-smi
- name: ROCM-INFO
run: |
rocminfo | grep "Agent" -A 14
- name: Show ROCR environment
run: |
echo "ROCR: $ROCR_VISIBLE_DEVICES"
- name: Environment
working-directory: /transformers
run: |
python3 utils/print_env.py
- name: Show installed libraries and their versions
working-directory: /transformers
run: pip freeze
- name: Run all tests on GPU
working-directory: /transformers
run: python3 -m pytest -rsfE -v --make-reports=${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }} -m "not not_device_test"
- name: Failure short reports
if: ${{ failure() }}
continue-on-error: true
run: cat /transformers/reports/${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/failures_short.txt
- name: Run test
shell: bash
run: |
mkdir -p /transformers/reports/${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports
echo "hello" > /transformers/reports/${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/hello.txt
echo "${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports"
- name: "Test suite reports artifacts: ${{ inputs.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports"
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
name: ${{ inputs.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports
path: /transformers/reports/${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports

View File

@ -7,7 +7,7 @@ on:
- cron: "17 2 * * *"
push:
branches:
- allow_ci_to_use_a10
- trigger-remove-script-datasets-in-tests
workflow_dispatch:
inputs:
prev_workflow_run_id:
@ -22,10 +22,10 @@ on:
default: ""
# Used for `push` to easily modify the target workflow runs to compare against
# Used for `push` to easily modiffy the target workflow runs to compare against
env:
prev_workflow_run_id: ""
other_workflow_run_id: ""
other_workflow_run_id: "15770139098"
jobs:
@ -51,68 +51,8 @@ jobs:
with:
job: run_models_gpu
slack_report_channel: "#transformers-ci-daily-models"
runner: daily-ci
docker: huggingface/transformers-all-latest-gpu
ci_event: Daily CI
report_repo_id: hf-internal-testing/transformers_daily_ci
secrets: inherit
# torch-pipeline:
# name: Torch pipeline CI
# uses: ./.github/workflows/self-scheduled.yml
# with:
# job: run_pipelines_torch_gpu
# slack_report_channel: "#transformers-ci-daily-pipeline-torch"
# runner: daily-ci
# docker: huggingface/transformers-pytorch-gpu
# ci_event: Daily CI
# report_repo_id: hf-internal-testing/transformers_daily_ci
# secrets: inherit
#
# example-ci:
# name: Example CI
# uses: ./.github/workflows/self-scheduled.yml
# with:
# job: run_examples_gpu
# slack_report_channel: "#transformers-ci-daily-examples"
# runner: daily-ci
# docker: huggingface/transformers-all-latest-gpu
# ci_event: Daily CI
# report_repo_id: hf-internal-testing/transformers_daily_ci
# secrets: inherit
#
# trainer-fsdp-ci:
# name: Trainer/FSDP CI
# uses: ./.github/workflows/self-scheduled.yml
# with:
# job: run_trainer_and_fsdp_gpu
# slack_report_channel: "#transformers-ci-daily-training"
# runner: daily-ci
# docker: huggingface/transformers-all-latest-gpu
# ci_event: Daily CI
# report_repo_id: hf-internal-testing/transformers_daily_ci
# secrets: inherit
#
# deepspeed-ci:
# name: DeepSpeed CI
# uses: ./.github/workflows/self-scheduled.yml
# with:
# job: run_torch_cuda_extensions_gpu
# slack_report_channel: "#transformers-ci-daily-training"
# runner: daily-ci
# docker: huggingface/transformers-pytorch-deepspeed-latest-gpu
# ci_event: Daily CI
# working-directory-prefix: /workspace
# report_repo_id: hf-internal-testing/transformers_daily_ci
# secrets: inherit
#
# quantization-ci:
# name: Quantization CI
# uses: ./.github/workflows/self-scheduled.yml
# with:
# job: run_quantization_torch_gpu
# slack_report_channel: "#transformers-ci-daily-quantization"
# runner: daily-ci
# docker: huggingface/transformers-quantization-latest-gpu
# ci_event: Daily CI
# report_repo_id: hf-internal-testing/transformers_daily_ci
# secrets: inherit

View File

@ -15,6 +15,9 @@ on:
slack_report_channel:
required: true
type: string
runner:
required: true
type: string
docker:
required: true
type: string
@ -59,7 +62,6 @@ jobs:
outputs:
folder_slices: ${{ steps.set-matrix.outputs.folder_slices }}
slice_ids: ${{ steps.set-matrix.outputs.slice_ids }}
runner_map: ${{ steps.set-matrix.outputs.runner_map }}
quantization_matrix: ${{ steps.set-matrix-quantization.outputs.quantization_matrix }}
steps:
- name: Update clone
@ -86,7 +88,6 @@ jobs:
if [ "${{ inputs.job }}" = "run_models_gpu" ]; then
echo "folder_slices=$(python3 ../utils/split_model_tests.py --num_splits ${{ env.NUM_SLICES }})" >> $GITHUB_OUTPUT
echo "slice_ids=$(python3 -c 'd = list(range(${{ env.NUM_SLICES }})); print(d)')" >> $GITHUB_OUTPUT
echo "runner_map=$(python3 ../utils/get_runner_map.py)" >> $GITHUB_OUTPUT
elif [ "${{ inputs.job }}" = "run_trainer_and_fsdp_gpu" ]; then
echo "folder_slices=[['trainer'], ['fsdp']]" >> $GITHUB_OUTPUT
echo "slice_ids=[0, 1]" >> $GITHUB_OUTPUT
@ -110,14 +111,14 @@ jobs:
strategy:
fail-fast: false
matrix:
machine_type: [single-gpu, multi-gpu]
machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }}
uses: ./.github/workflows/model_jobs.yml
with:
folder_slices: ${{ needs.setup.outputs.folder_slices }}
machine_type: ${{ matrix.machine_type }}
slice_id: ${{ matrix.slice_id }}
runner_map: ${{ needs.setup.outputs.runner_map }}
runner: ${{ inputs.runner }}
docker: ${{ inputs.docker }}
secrets: inherit
@ -135,6 +136,7 @@ jobs:
folder_slices: ${{ needs.setup.outputs.folder_slices }}
machine_type: ${{ matrix.machine_type }}
slice_id: ${{ matrix.slice_id }}
runner: ${{ inputs.runner }}
docker: ${{ inputs.docker }}
report_name_prefix: run_trainer_and_fsdp_gpu
secrets: inherit

View File

@ -8,13 +8,19 @@ check_dirs := examples tests src utils
exclude_folders := ""
modified_only_fixup:
$(eval modified_py_files := $(shell python utils/get_modified_files.py $(check_dirs)))
@if test -n "$(modified_py_files)"; then \
echo "Checking/fixing $(modified_py_files)"; \
ruff check $(modified_py_files) --fix --exclude $(exclude_folders); \
ruff format $(modified_py_files) --exclude $(exclude_folders);\
@current_branch=$$(git branch --show-current); \
if [ "$$current_branch" = "main" ]; then \
echo "On main branch, running 'style' target instead..."; \
$(MAKE) style; \
else \
echo "No library .py files were modified"; \
modified_py_files=$$(python utils/get_modified_files.py $(check_dirs)); \
if [ -n "$$modified_py_files" ]; then \
echo "Checking/fixing files: $${modified_py_files}"; \
ruff check $${modified_py_files} --fix --exclude $(exclude_folders); \
ruff format $${modified_py_files} --exclude $(exclude_folders); \
else \
echo "No library .py files were modified"; \
fi; \
fi
# Update src/transformers/dependency_versions_table.py

View File

@ -56,7 +56,7 @@ Here is how to use the processor to process text and audio:
```python
>>> # let's load an audio sample from an Arabic speech corpus
>>> from datasets import load_dataset
>>> dataset = load_dataset("arabic_speech_corpus", split="test", streaming=True, trust_remote_code=True)
>>> dataset = load_dataset("halabi2016/arabic_speech_corpus", split="test", streaming=True)
>>> audio_sample = next(iter(dataset))["audio"]
>>> # now, process it

View File

@ -56,7 +56,7 @@ Here is how to use the processor to process text and audio:
```python
>>> # let's load an audio sample from an Arabic speech corpus
>>> from datasets import load_dataset
>>> dataset = load_dataset("arabic_speech_corpus", split="test", streaming=True, trust_remote_code=True)
>>> dataset = load_dataset("halabi2016/arabic_speech_corpus", split="test", streaming=True)
>>> audio_sample = next(iter(dataset))["audio"]
>>> # now, process it

View File

@ -493,6 +493,33 @@ training_args = TrainingArguments(
)
```
You can also configure which specific kernels to apply using the `liger_kernel_config` parameter. This dict is passed as keyword arguments to the `_apply_liger_kernel_to_instance` function, allowing fine-grained control over kernel usage. Available options vary by model but typically include: `rope`, `swiglu`, `cross_entropy`, `fused_linear_cross_entropy`, `rms_norm`, etc.
```py
from transformers import TrainingArguments
# Apply only specific kernels
training_args = TrainingArguments(
output_dir="your-model",
learning_rate=2e-5,
per_device_train_batch_size=16,
per_device_eval_batch_size=16,
num_train_epochs=2,
weight_decay=0.01,
eval_strategy="epoch",
save_strategy="epoch",
load_best_model_at_end=True,
push_to_hub=True,
use_liger_kernel=True,
liger_kernel_config={
"rope": True,
"cross_entropy": True,
"rms_norm": False, # Don't apply Liger's RMSNorm kernel
"swiglu": True,
}
)
```
### NEFTune
[NEFTune](https://hf.co/papers/2310.05914) adds noise to the embedding vectors during training to improve model performance. Enable it in [`Trainer`] with the `neftune_noise_alpha` parameter in [`TrainingArguments`] to control how much noise is added.
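
Below is a minimal sketch of turning NEFTune on; the other arguments are illustrative placeholders, and `neftune_noise_alpha` is the only NEFTune-specific setting.

```py
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="your-model",          # illustrative placeholder
    per_device_train_batch_size=16,   # illustrative placeholder
    num_train_epochs=2,               # illustrative placeholder
    neftune_noise_alpha=5.0,          # scale of the noise added to embedding vectors during training
)
```

The noise is only applied while training; the Trainer removes the hook afterwards, so evaluation and inference are unaffected.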

View File

@ -264,7 +264,6 @@ class ExamplesTests(TestCasePlus):
--dataset_config clean
--train_split_name validation
--eval_split_name validation
--trust_remote_code
--output_dir {tmp_dir}
--overwrite_output_dir
--num_train_epochs=2

View File

@ -312,7 +312,6 @@ class ExamplesTestsNoTrainer(TestCasePlus):
{self.examples_dir}/pytorch/image-classification/run_image_classification_no_trainer.py
--model_name_or_path google/vit-base-patch16-224-in21k
--dataset_name hf-internal-testing/cats_vs_dogs_sample
--trust_remote_code
--learning_rate 1e-4
--per_device_train_batch_size 2
--per_device_eval_batch_size 1

View File

@ -390,7 +390,6 @@ class ExamplesTests(TestCasePlus):
--output_dir {tmp_dir}
--model_name_or_path google/vit-base-patch16-224-in21k
--dataset_name hf-internal-testing/cats_vs_dogs_sample
--trust_remote_code
--do_train
--do_eval
--learning_rate 1e-4
@ -424,7 +423,6 @@ class ExamplesTests(TestCasePlus):
--dataset_config_name clean
--train_split_name validation
--eval_split_name validation
--trust_remote_code
--do_train
--do_eval
--learning_rate 1e-4
@ -455,7 +453,6 @@ class ExamplesTests(TestCasePlus):
--dataset_config_name clean
--train_split_name validation
--eval_split_name validation
--trust_remote_code
--do_train
--do_eval
--learning_rate 1e-4
@ -488,7 +485,6 @@ class ExamplesTests(TestCasePlus):
--dataset_config_name clean
--train_split_name validation
--eval_split_name validation
--trust_remote_code
--do_train
--do_eval
--learning_rate 1e-4
@ -516,7 +512,6 @@ class ExamplesTests(TestCasePlus):
--output_dir {tmp_dir}
--model_name_or_path hf-internal-testing/tiny-random-wav2vec2
--dataset_name anton-l/superb_demo
--trust_remote_code
--dataset_config_name ks
--train_split_name test
--eval_split_name test
@ -551,7 +546,6 @@ class ExamplesTests(TestCasePlus):
--dataset_name hf-internal-testing/librispeech_asr_dummy
--dataset_config_names clean
--dataset_split_names validation
--trust_remote_code
--learning_rate 1e-4
--per_device_train_batch_size 4
--per_device_eval_batch_size 4
@ -572,7 +566,6 @@ class ExamplesTests(TestCasePlus):
run_mae.py
--output_dir {tmp_dir}
--dataset_name hf-internal-testing/cats_vs_dogs_sample
--trust_remote_code
--do_train
--do_eval
--learning_rate 1e-4

View File

@ -315,7 +315,6 @@ class ExamplesTests(TestCasePlus):
testargs = f"""
run_image_classification.py
--dataset_name hf-internal-testing/cats_vs_dogs_sample
--trust_remote_code
--model_name_or_path microsoft/resnet-18
--do_train
--do_eval

View File

@ -29,7 +29,6 @@ import warnings
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor, as_completed
from contextlib import contextmanager
from dataclasses import dataclass
from enum import Enum
from functools import partial, wraps
from threading import Thread
@ -41,7 +40,6 @@ from huggingface_hub import split_torch_state_dict_into_shards
from packaging import version
from torch import Tensor, nn
from torch.distributions import constraints
from torch.nn import CrossEntropyLoss, Identity
from torch.utils.checkpoint import checkpoint
from transformers.utils import is_torchao_available
@ -50,7 +48,6 @@ from transformers.utils import is_torchao_available
if is_torchao_available():
from torchao.quantization import Int4WeightOnlyConfig
from .activations import get_activation
from .configuration_utils import PretrainedConfig
from .dynamic_module_utils import custom_object_save
from .generation import CompileConfig, GenerationConfig
@ -98,7 +95,6 @@ from .utils import (
WEIGHTS_INDEX_NAME,
WEIGHTS_NAME,
ContextManagers,
ModelOutput,
PushToHubMixin,
cached_file,
check_torch_load_is_safe,
@ -123,7 +119,6 @@ from .utils import (
is_torch_xla_available,
is_torch_xpu_available,
logging,
replace_return_docstrings,
strtobool,
)
from .utils.generic import GeneralInterface
@ -5624,453 +5619,6 @@ if PreTrainedModel.push_to_hub.__doc__ is not None:
)
class PoolerStartLogits(nn.Module):
"""
Compute SQuAD start logits from sequence hidden states.
Args:
config ([`PretrainedConfig`]):
The config used by the model, will be used to grab the `hidden_size` of the model.
"""
def __init__(self, config: PretrainedConfig):
super().__init__()
self.dense = nn.Linear(config.hidden_size, 1)
logger.warning_once(
"[DEPRECATION WARNING] `PoolerStartLogits` is deprecated and will be removed in v4.53. "
"Please use model-specific class, e.g. `XLMPoolerStartLogits`."
)
def forward(
self, hidden_states: torch.FloatTensor, p_mask: Optional[torch.FloatTensor] = None
) -> torch.FloatTensor:
"""
Args:
hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`):
The final hidden states of the model.
p_mask (`torch.FloatTensor` of shape `(batch_size, seq_len)`, *optional*):
Mask for tokens at invalid position, such as query and special symbols (PAD, SEP, CLS). 1.0 means token
should be masked.
Returns:
`torch.FloatTensor`: The start logits for SQuAD.
"""
x = self.dense(hidden_states).squeeze(-1)
if p_mask is not None:
if get_parameter_dtype(self) == torch.float16:
x = x * (1 - p_mask) - 65500 * p_mask
else:
x = x * (1 - p_mask) - 1e30 * p_mask
return x
class PoolerEndLogits(nn.Module):
"""
Compute SQuAD end logits from sequence hidden states.
Args:
config ([`PretrainedConfig`]):
The config used by the model, will be used to grab the `hidden_size` of the model and the `layer_norm_eps`
to use.
"""
def __init__(self, config: PretrainedConfig):
super().__init__()
self.dense_0 = nn.Linear(config.hidden_size * 2, config.hidden_size)
self.activation = nn.Tanh()
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dense_1 = nn.Linear(config.hidden_size, 1)
logger.warning_once(
"[DEPRECATION WARNING] `PoolerEndLogits` is deprecated and will be removed in v4.53. "
"Please use model-specific class, e.g. `XLMPoolerEndLogits`."
)
def forward(
self,
hidden_states: torch.FloatTensor,
start_states: Optional[torch.FloatTensor] = None,
start_positions: Optional[torch.LongTensor] = None,
p_mask: Optional[torch.FloatTensor] = None,
) -> torch.FloatTensor:
"""
Args:
hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`):
The final hidden states of the model.
start_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`, *optional*):
The hidden states of the first tokens for the labeled span.
start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
The position of the first token for the labeled span.
p_mask (`torch.FloatTensor` of shape `(batch_size, seq_len)`, *optional*):
Mask for tokens at invalid position, such as query and special symbols (PAD, SEP, CLS). 1.0 means token
should be masked.
<Tip>
One of `start_states` or `start_positions` should be not `None`. If both are set, `start_positions` overrides
`start_states`.
</Tip>
Returns:
`torch.FloatTensor`: The end logits for SQuAD.
"""
assert start_states is not None or start_positions is not None, (
"One of start_states, start_positions should be not None"
)
if start_positions is not None:
slen, hsz = hidden_states.shape[-2:]
start_positions = start_positions[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz)
start_states = hidden_states.gather(-2, start_positions) # shape (bsz, 1, hsz)
start_states = start_states.expand(-1, slen, -1) # shape (bsz, slen, hsz)
x = self.dense_0(torch.cat([hidden_states, start_states], dim=-1))
x = self.activation(x)
x = self.LayerNorm(x)
x = self.dense_1(x).squeeze(-1)
if p_mask is not None:
if get_parameter_dtype(self) == torch.float16:
x = x * (1 - p_mask) - 65500 * p_mask
else:
x = x * (1 - p_mask) - 1e30 * p_mask
return x
class PoolerAnswerClass(nn.Module):
"""
Compute SQuAD 2.0 answer class from classification and start tokens hidden states.
Args:
config ([`PretrainedConfig`]):
The config used by the model, will be used to grab the `hidden_size` of the model.
"""
def __init__(self, config):
super().__init__()
self.dense_0 = nn.Linear(config.hidden_size * 2, config.hidden_size)
self.activation = nn.Tanh()
self.dense_1 = nn.Linear(config.hidden_size, 1, bias=False)
logger.warning_once(
"[DEPRECATION WARNING] `PoolerAnswerClass` is deprecated and will be removed in v4.53. "
"Please use model-specific class, e.g. `XLMPoolerAnswerClass`."
)
def forward(
self,
hidden_states: torch.FloatTensor,
start_states: Optional[torch.FloatTensor] = None,
start_positions: Optional[torch.LongTensor] = None,
cls_index: Optional[torch.LongTensor] = None,
) -> torch.FloatTensor:
"""
Args:
hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`):
The final hidden states of the model.
start_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`, *optional*):
The hidden states of the first tokens for the labeled span.
start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
The position of the first token for the labeled span.
cls_index (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Position of the CLS token for each sentence in the batch. If `None`, takes the last token.
<Tip>
One of `start_states` or `start_positions` should be not `None`. If both are set, `start_positions` overrides
`start_states`.
</Tip>
Returns:
`torch.FloatTensor`: The SQuAD 2.0 answer class.
"""
# No dependency on end_feature so that we can obtain one single `cls_logits` for each sample.
hsz = hidden_states.shape[-1]
assert start_states is not None or start_positions is not None, (
"One of start_states, start_positions should be not None"
)
if start_positions is not None:
start_positions = start_positions[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz)
start_states = hidden_states.gather(-2, start_positions).squeeze(-2) # shape (bsz, hsz)
if cls_index is not None:
cls_index = cls_index[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz)
cls_token_state = hidden_states.gather(-2, cls_index).squeeze(-2) # shape (bsz, hsz)
else:
cls_token_state = hidden_states[:, -1, :] # shape (bsz, hsz)
x = self.dense_0(torch.cat([start_states, cls_token_state], dim=-1))
x = self.activation(x)
x = self.dense_1(x).squeeze(-1)
return x
@dataclass
class SquadHeadOutput(ModelOutput):
"""
Base class for outputs of question answering models using a [`~modeling_utils.SQuADHead`].
Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned if both `start_positions` and `end_positions` are provided):
Classification loss as the sum of start token, end token (and is_impossible if provided) classification
losses.
start_top_log_probs (`torch.FloatTensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
Log probabilities for the top config.start_n_top start token possibilities (beam-search).
start_top_index (`torch.LongTensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
Indices for the top config.start_n_top start token possibilities (beam-search).
end_top_log_probs (`torch.FloatTensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
Log probabilities for the top `config.start_n_top * config.end_n_top` end token possibilities
(beam-search).
end_top_index (`torch.LongTensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
Indices for the top `config.start_n_top * config.end_n_top` end token possibilities (beam-search).
cls_logits (`torch.FloatTensor` of shape `(batch_size,)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
Log probabilities for the `is_impossible` label of the answers.
"""
loss: Optional[torch.FloatTensor] = None
start_top_log_probs: Optional[torch.FloatTensor] = None
start_top_index: Optional[torch.LongTensor] = None
end_top_log_probs: Optional[torch.FloatTensor] = None
end_top_index: Optional[torch.LongTensor] = None
cls_logits: Optional[torch.FloatTensor] = None
def __post_init__(self):
logger.warning_once(
"[DEPRECATION WARNING] `SquadHeadOutput` is deprecated and will be removed in v4.53. "
"Please use model-specific class, e.g. `XLMSquadHeadOutput`."
)
class SQuADHead(nn.Module):
r"""
A SQuAD head inspired by XLNet.
Args:
config ([`PretrainedConfig`]):
The config used by the model, will be used to grab the `hidden_size` of the model and the `layer_norm_eps`
to use.
"""
def __init__(self, config):
super().__init__()
self.start_n_top = config.start_n_top
self.end_n_top = config.end_n_top
self.start_logits = PoolerStartLogits(config)
self.end_logits = PoolerEndLogits(config)
self.answer_class = PoolerAnswerClass(config)
logger.warning_once(
"[DEPRECATION WARNING] `SQuADHead` is deprecated and will be removed in v4.53. "
"Please use model-specific class, e.g. `XLMSQuADHead`."
)
@replace_return_docstrings(output_type=SquadHeadOutput, config_class=PretrainedConfig)
def forward(
self,
hidden_states: torch.FloatTensor,
start_positions: Optional[torch.LongTensor] = None,
end_positions: Optional[torch.LongTensor] = None,
cls_index: Optional[torch.LongTensor] = None,
is_impossible: Optional[torch.LongTensor] = None,
p_mask: Optional[torch.FloatTensor] = None,
return_dict: bool = False,
) -> Union[SquadHeadOutput, tuple[torch.FloatTensor]]:
"""
Args:
hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`):
Final hidden states of the model on the sequence tokens.
start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Positions of the first token for the labeled span.
end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Positions of the last token for the labeled span.
cls_index (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Position of the CLS token for each sentence in the batch. If `None`, takes the last token.
is_impossible (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Whether the question has a possible answer in the paragraph or not.
p_mask (`torch.FloatTensor` of shape `(batch_size, seq_len)`, *optional*):
Mask for tokens at invalid position, such as query and special symbols (PAD, SEP, CLS). 1.0 means token
should be masked.
return_dict (`bool`, *optional*, defaults to `False`):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
Returns:
"""
start_logits = self.start_logits(hidden_states, p_mask=p_mask)
if start_positions is not None and end_positions is not None:
# If we are on multi-GPU, let's remove the dimension added by batch splitting
for x in (start_positions, end_positions, cls_index, is_impossible):
if x is not None and x.dim() > 1:
x.squeeze_(-1)
# during training, compute the end logits based on the ground truth of the start position
end_logits = self.end_logits(hidden_states, start_positions=start_positions, p_mask=p_mask)
loss_fct = CrossEntropyLoss()
start_loss = loss_fct(start_logits, start_positions)
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
if cls_index is not None and is_impossible is not None:
# Predict answerability from the representation of CLS and START
cls_logits = self.answer_class(hidden_states, start_positions=start_positions, cls_index=cls_index)
loss_fct_cls = nn.BCEWithLogitsLoss()
cls_loss = loss_fct_cls(cls_logits, is_impossible)
# note(zhiliny): by default multiply the loss by 0.5 so that the scale is comparable to start_loss and end_loss
total_loss += cls_loss * 0.5
return SquadHeadOutput(loss=total_loss) if return_dict else (total_loss,)
else:
# during inference, compute the end logits based on beam search
bsz, slen, hsz = hidden_states.size()
start_log_probs = nn.functional.softmax(start_logits, dim=-1) # shape (bsz, slen)
start_top_log_probs, start_top_index = torch.topk(
start_log_probs, self.start_n_top, dim=-1
) # shape (bsz, start_n_top)
start_top_index_exp = start_top_index.unsqueeze(-1).expand(-1, -1, hsz) # shape (bsz, start_n_top, hsz)
start_states = torch.gather(hidden_states, -2, start_top_index_exp) # shape (bsz, start_n_top, hsz)
start_states = start_states.unsqueeze(1).expand(-1, slen, -1, -1) # shape (bsz, slen, start_n_top, hsz)
hidden_states_expanded = hidden_states.unsqueeze(2).expand_as(
start_states
) # shape (bsz, slen, start_n_top, hsz)
p_mask = p_mask.unsqueeze(-1) if p_mask is not None else None
end_logits = self.end_logits(hidden_states_expanded, start_states=start_states, p_mask=p_mask)
end_log_probs = nn.functional.softmax(end_logits, dim=1) # shape (bsz, slen, start_n_top)
end_top_log_probs, end_top_index = torch.topk(
end_log_probs, self.end_n_top, dim=1
) # shape (bsz, end_n_top, start_n_top)
end_top_log_probs = end_top_log_probs.view(-1, self.start_n_top * self.end_n_top)
end_top_index = end_top_index.view(-1, self.start_n_top * self.end_n_top)
start_states = torch.einsum("blh,bl->bh", hidden_states, start_log_probs)
cls_logits = self.answer_class(hidden_states, start_states=start_states, cls_index=cls_index)
if not return_dict:
return (start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits)
else:
return SquadHeadOutput(
start_top_log_probs=start_top_log_probs,
start_top_index=start_top_index,
end_top_log_probs=end_top_log_probs,
end_top_index=end_top_index,
cls_logits=cls_logits,
)
class SequenceSummary(nn.Module):
r"""
Compute a single vector summary of a sequence hidden states.
Args:
config ([`PretrainedConfig`]):
The config used by the model. Relevant arguments in the config class of the model are (refer to the actual
config class of your model for the default values it uses):
- **summary_type** (`str`) -- The method to use to make this summary. Accepted values are:
- `"last"` -- Take the last token hidden state (like XLNet)
- `"first"` -- Take the first token hidden state (like Bert)
- `"mean"` -- Take the mean of all tokens hidden states
- `"cls_index"` -- Supply a Tensor of classification token position (GPT/GPT-2)
- `"attn"` -- Not implemented now, use multi-head attention
- **summary_use_proj** (`bool`) -- Add a projection after the vector extraction.
- **summary_proj_to_labels** (`bool`) -- If `True`, the projection outputs to `config.num_labels` classes
(otherwise to `config.hidden_size`).
- **summary_activation** (`Optional[str]`) -- Set to `"tanh"` to add a tanh activation to the output,
another string or `None` will add no activation.
- **summary_first_dropout** (`float`) -- Optional dropout probability before the projection and activation.
- **summary_last_dropout** (`float`)-- Optional dropout probability after the projection and activation.
"""
def __init__(self, config: PretrainedConfig):
super().__init__()
self.summary_type = getattr(config, "summary_type", "last")
if self.summary_type == "attn":
# We should use a standard multi-head attention module with absolute positional embedding for that.
# Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276
# We can probably just use the multi-head attention module of PyTorch >=1.1.0
raise NotImplementedError
self.summary = Identity()
if hasattr(config, "summary_use_proj") and config.summary_use_proj:
if hasattr(config, "summary_proj_to_labels") and config.summary_proj_to_labels and config.num_labels > 0:
num_classes = config.num_labels
else:
num_classes = config.hidden_size
self.summary = nn.Linear(config.hidden_size, num_classes)
activation_string = getattr(config, "summary_activation", None)
self.activation: Callable = get_activation(activation_string) if activation_string else Identity()
self.first_dropout = Identity()
if hasattr(config, "summary_first_dropout") and config.summary_first_dropout > 0:
self.first_dropout = nn.Dropout(config.summary_first_dropout)
self.last_dropout = Identity()
if hasattr(config, "summary_last_dropout") and config.summary_last_dropout > 0:
self.last_dropout = nn.Dropout(config.summary_last_dropout)
logger.warning_once(
"[DEPRECATION WARNING] `SequenceSummary` is deprecated and will be removed in v4.53. "
"Please use model-specific class, e.g. `XLMSequenceSummary`."
)
def forward(
self, hidden_states: torch.FloatTensor, cls_index: Optional[torch.LongTensor] = None
) -> torch.FloatTensor:
"""
Compute a single vector summary of a sequence hidden states.
Args:
hidden_states (`torch.FloatTensor` of shape `[batch_size, seq_len, hidden_size]`):
The hidden states of the last layer.
cls_index (`torch.LongTensor` of shape `[batch_size]` or `[batch_size, ...]` where ... are optional leading dimensions of `hidden_states`, *optional*):
Used if `summary_type == "cls_index"` and takes the last token of the sequence as classification token.
Returns:
`torch.FloatTensor`: The summary of the sequence hidden states.
"""
if self.summary_type == "last":
output = hidden_states[:, -1]
elif self.summary_type == "first":
output = hidden_states[:, 0]
elif self.summary_type == "mean":
output = hidden_states.mean(dim=1)
elif self.summary_type == "cls_index":
if cls_index is None:
cls_index = torch.full_like(
hidden_states[..., :1, :],
hidden_states.shape[-2] - 1,
dtype=torch.long,
)
else:
cls_index = cls_index.unsqueeze(-1).unsqueeze(-1)
cls_index = cls_index.expand((-1,) * (cls_index.dim() - 1) + (hidden_states.size(-1),))
# shape of cls_index: (bsz, XX, 1, hidden_size) where XX are optional leading dim of hidden_states
output = hidden_states.gather(-2, cls_index).squeeze(-2) # shape (bsz, XX, hidden_size)
elif self.summary_type == "attn":
raise NotImplementedError
output = self.first_dropout(output)
output = self.summary(output)
output = self.activation(output)
output = self.last_dropout(output)
return output
def unwrap_model(model: nn.Module, recursive: bool = False) -> nn.Module:
"""
Recursively unwraps a model from potential containers (as used in distributed training).

View File

@ -206,7 +206,7 @@ def convert_audio_spectrogram_transformer_checkpoint(model_name, pytorch_dump_fo
if "speech-commands" in model_name:
# TODO: Convert dataset to Parquet
dataset = load_dataset("google/speech_commands", "v0.02", split="validation", trust_remote_code=True)
dataset = load_dataset("google/speech_commands", "v0.02", split="validation")
waveform = dataset[0]["audio"]["array"]
else:
filepath = hf_hub_download(

View File

@ -245,6 +245,10 @@ TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]](
("gpt_neox_japanese", ("GPTNeoXJapaneseTokenizer", None)),
("gptj", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
("gptsan-japanese", ("GPTSanJapaneseTokenizer", None)),
("granite", ("GPT2Tokenizer", None)),
("granitemoe", ("GPT2Tokenizer", None)),
("granitemoehybrid", ("GPT2Tokenizer", None)),
("granitemoeshared", ("GPT2Tokenizer", None)),
("grounding-dino", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
("groupvit", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)),
("helium", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),

View File

@ -266,7 +266,7 @@ def convert_beit_checkpoint(checkpoint_url, pytorch_dump_folder_path):
# Check outputs on an image
if is_semantic:
image_processor = BeitImageProcessor(size=config.image_size, do_center_crop=False)
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
image = Image.open(ds[0]["file"])
else:
image_processor = BeitImageProcessor(

View File

@ -15,7 +15,14 @@
"""English Normalizer class for CLVP."""
import re
import sys
if sys.version_info >= (3, 11):
# Atomic grouping support was only added to the core RE in Python 3.11
import re
else:
import regex as re
class EnglishNormalizer:
@ -199,12 +206,12 @@ class EnglishNormalizer:
This method is used to normalize numbers within a text such as converting the numbers to words, removing
commas, etc.
"""
text = re.sub(re.compile(r"([0-9][0-9\,]+[0-9])"), self._remove_commas, text)
text = re.sub(re.compile(r"£([0-9\,]*[0-9]+)"), r"\1 pounds", text)
text = re.sub(re.compile(r"\$([0-9\.\,]*[0-9]+)"), self._expand_dollars, text)
text = re.sub(re.compile(r"([0-9]+\.[0-9]+)"), self._expand_decimal_point, text)
text = re.sub(re.compile(r"[0-9]+(st|nd|rd|th)"), self._expand_ordinal, text)
text = re.sub(re.compile(r"[0-9]+"), self._expand_number, text)
text = re.sub(r"([0-9][0-9,]+[0-9])", self._remove_commas, text)
text = re.sub(r"£([0-9,]*[0-9])", r"\1 pounds", text)
text = re.sub(r"\$([0-9.,]*[0-9])", self._expand_dollars, text)
text = re.sub(r"([0-9]++\.[0-9]+)", self._expand_decimal_point, text)
text = re.sub(r"[0-9]++(st|nd|rd|th)", self._expand_ordinal, text)
text = re.sub(r"[0-9]+", self._expand_number, text)
return text
def expand_abbreviations(self, text: str) -> str:
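
For reference, a minimal sketch of the behaviour the import guard above relies on: possessive quantifiers such as `[0-9]++` are only supported by the stdlib `re` engine from Python 3.11 onward, so older interpreters fall back to the third-party `regex` package (assumed to be installed here).

```py
import sys

if sys.version_info >= (3, 11):
    import re
else:
    import regex as re  # drop-in superset of `re` with possessive-quantifier support

# Digits matched by `[0-9]++` are consumed possessively and never handed back to the
# engine, avoiding the backtracking that made the original greedy patterns slow on
# adversarial input (the ReDOS fixed in this compare).
print(re.sub(r"[0-9]++(st|nd|rd|th)", "<ordinal>", "meet on the 21st"))  # meet on the <ordinal>
```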

View File

@ -226,7 +226,7 @@ def convert_wav2vec2_checkpoint(
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-lv60")
ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True)
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
input_audio = [x["array"] for x in ds[:4]["audio"]]
inputs = processor(input_audio, return_tensors="pt", padding=True)

View File

@ -1223,7 +1223,7 @@ class LayoutLMForQuestionAnswering(LayoutLMPreTrainedModel):
>>> tokenizer = AutoTokenizer.from_pretrained("impira/layoutlm-document-qa", add_prefix_space=True)
>>> model = LayoutLMForQuestionAnswering.from_pretrained("impira/layoutlm-document-qa", revision="1e3ebac")
>>> dataset = load_dataset("nielsr/funsd", split="train", trust_remote_code=True)
>>> dataset = load_dataset("nielsr/funsd", split="train")
>>> example = dataset[0]
>>> question = "what's his name?"
>>> words = example["words"]

View File

@ -1601,7 +1601,7 @@ class TFLayoutLMForQuestionAnswering(TFLayoutLMPreTrainedModel, TFQuestionAnswer
>>> tokenizer = AutoTokenizer.from_pretrained("impira/layoutlm-document-qa", add_prefix_space=True)
>>> model = TFLayoutLMForQuestionAnswering.from_pretrained("impira/layoutlm-document-qa", revision="1e3ebac")
>>> dataset = load_dataset("nielsr/funsd", split="train", trust_remote_code=True)
>>> dataset = load_dataset("nielsr/funsd", split="train")
>>> example = dataset[0]
>>> question = "what's his name?"
>>> words = example["words"]

View File

@ -763,9 +763,8 @@ class LayoutLMv2Model(LayoutLMv2PreTrainedModel):
>>> model = LayoutLMv2Model.from_pretrained("microsoft/layoutlmv2-base-uncased")
>>> dataset = load_dataset("hf-internal-testing/fixtures_docvqa", trust_remote_code=True)
>>> image_path = dataset["test"][0]["file"]
>>> image = Image.open(image_path).convert("RGB")
>>> dataset = load_dataset("hf-internal-testing/fixtures_docvqa")
>>> image = dataset["test"][0]["image"]
>>> encoding = processor(image, return_tensors="pt")
@ -953,7 +952,7 @@ class LayoutLMv2ForSequenceClassification(LayoutLMv2PreTrainedModel):
>>> set_seed(0)
>>> dataset = load_dataset("aharley/rvl_cdip", split="train", streaming=True, trust_remote_code=True)
>>> dataset = load_dataset("aharley/rvl_cdip", split="train", streaming=True)
>>> data = next(iter(dataset))
>>> image = data["image"].convert("RGB")
@ -1155,7 +1154,7 @@ class LayoutLMv2ForTokenClassification(LayoutLMv2PreTrainedModel):
>>> set_seed(0)
>>> datasets = load_dataset("nielsr/funsd", split="test", trust_remote_code=True)
>>> datasets = load_dataset("nielsr/funsd", split="test")
>>> labels = datasets.features["ner_tags"].feature.names
>>> id2label = {v: k for v, k in enumerate(labels)}
@ -1312,9 +1311,8 @@ class LayoutLMv2ForQuestionAnswering(LayoutLMv2PreTrainedModel):
>>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv2-base-uncased")
>>> model = LayoutLMv2ForQuestionAnswering.from_pretrained("microsoft/layoutlmv2-base-uncased")
>>> dataset = load_dataset("hf-internal-testing/fixtures_docvqa", trust_remote_code=True)
>>> image_path = dataset["test"][0]["file"]
>>> image = Image.open(image_path).convert("RGB")
>>> dataset = load_dataset("hf-internal-testing/fixtures_docvqa")
>>> image = dataset["test"][0]["image"]
>>> question = "When is coffee break?"
>>> encoding = processor(image, question, return_tensors="pt")

View File

@ -746,7 +746,7 @@ class LayoutLMv3Model(LayoutLMv3PreTrainedModel):
>>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
>>> model = AutoModel.from_pretrained("microsoft/layoutlmv3-base")
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
>>> example = dataset[0]
>>> image = example["image"]
>>> words = example["tokens"]
@ -961,7 +961,7 @@ class LayoutLMv3ForTokenClassification(LayoutLMv3PreTrainedModel):
>>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
>>> model = AutoModelForTokenClassification.from_pretrained("microsoft/layoutlmv3-base", num_labels=7)
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
>>> example = dataset[0]
>>> image = example["image"]
>>> words = example["tokens"]
@ -1062,7 +1062,7 @@ class LayoutLMv3ForQuestionAnswering(LayoutLMv3PreTrainedModel):
>>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
>>> model = AutoModelForQuestionAnswering.from_pretrained("microsoft/layoutlmv3-base")
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
>>> example = dataset[0]
>>> image = example["image"]
>>> question = "what's his name?"
@ -1182,7 +1182,7 @@ class LayoutLMv3ForSequenceClassification(LayoutLMv3PreTrainedModel):
>>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
>>> model = AutoModelForSequenceClassification.from_pretrained("microsoft/layoutlmv3-base")
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
>>> example = dataset[0]
>>> image = example["image"]
>>> words = example["tokens"]

View File

@ -1296,7 +1296,7 @@ class TFLayoutLMv3Model(TFLayoutLMv3PreTrainedModel):
>>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
>>> model = TFAutoModel.from_pretrained("microsoft/layoutlmv3-base")
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
>>> example = dataset[0]
>>> image = example["image"]
>>> words = example["tokens"]
@ -1439,7 +1439,7 @@ class TFLayoutLMv3ForSequenceClassification(TFLayoutLMv3PreTrainedModel, TFSeque
>>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
>>> model = TFAutoModelForSequenceClassification.from_pretrained("microsoft/layoutlmv3-base")
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
>>> example = dataset[0]
>>> image = example["image"]
>>> words = example["tokens"]
@ -1566,7 +1566,7 @@ class TFLayoutLMv3ForTokenClassification(TFLayoutLMv3PreTrainedModel, TFTokenCla
>>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
>>> model = TFAutoModelForTokenClassification.from_pretrained("microsoft/layoutlmv3-base", num_labels=7)
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
>>> example = dataset[0]
>>> image = example["image"]
>>> words = example["tokens"]
@ -1703,7 +1703,7 @@ class TFLayoutLMv3ForQuestionAnswering(TFLayoutLMv3PreTrainedModel, TFQuestionAn
>>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
>>> model = TFAutoModelForQuestionAnswering.from_pretrained("microsoft/layoutlmv3-base")
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
>>> example = dataset[0]
>>> image = example["image"]
>>> question = "what's his name?"

View File

@ -653,7 +653,7 @@ class LiltModel(LiltPreTrainedModel):
>>> tokenizer = AutoTokenizer.from_pretrained("SCUT-DLVCLab/lilt-roberta-en-base")
>>> model = AutoModel.from_pretrained("SCUT-DLVCLab/lilt-roberta-en-base")
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
>>> example = dataset[0]
>>> words = example["tokens"]
>>> boxes = example["bboxes"]
@ -793,7 +793,7 @@ class LiltForSequenceClassification(LiltPreTrainedModel):
>>> tokenizer = AutoTokenizer.from_pretrained("SCUT-DLVCLab/lilt-roberta-en-base")
>>> model = AutoModelForSequenceClassification.from_pretrained("SCUT-DLVCLab/lilt-roberta-en-base")
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
>>> example = dataset[0]
>>> words = example["tokens"]
>>> boxes = example["bboxes"]
@ -908,7 +908,7 @@ class LiltForTokenClassification(LiltPreTrainedModel):
>>> tokenizer = AutoTokenizer.from_pretrained("SCUT-DLVCLab/lilt-roberta-en-base")
>>> model = AutoModelForTokenClassification.from_pretrained("SCUT-DLVCLab/lilt-roberta-en-base")
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
>>> example = dataset[0]
>>> words = example["tokens"]
>>> boxes = example["bboxes"]
@ -1025,7 +1025,7 @@ class LiltForQuestionAnswering(LiltPreTrainedModel):
>>> tokenizer = AutoTokenizer.from_pretrained("SCUT-DLVCLab/lilt-roberta-en-base")
>>> model = AutoModelForQuestionAnswering.from_pretrained("SCUT-DLVCLab/lilt-roberta-en-base")
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
>>> example = dataset[0]
>>> words = example["tokens"]
>>> boxes = example["bboxes"]

View File

@ -2228,7 +2228,7 @@ class SpeechT5ForSpeechToText(SpeechT5PreTrainedModel, GenerationMixin):
>>> from datasets import load_dataset
>>> dataset = load_dataset(
... "hf-internal-testing/librispeech_asr_demo", "clean", split="validation", trust_remote_code=True
... "hf-internal-testing/librispeech_asr_demo", "clean", split="validation"
... ) # doctest: +IGNORE_RESULT
>>> dataset = dataset.sort("id")
>>> sampling_rate = dataset.features["audio"].sampling_rate
@ -2909,7 +2909,7 @@ class SpeechT5ForSpeechToSpeech(SpeechT5PreTrainedModel):
>>> import torch
>>> dataset = load_dataset(
... "hf-internal-testing/librispeech_asr_demo", "clean", split="validation", trust_remote_code=True
... "hf-internal-testing/librispeech_asr_demo", "clean", split="validation"
... ) # doctest: +IGNORE_RESULT
>>> dataset = dataset.sort("id")
>>> sampling_rate = dataset.features["audio"].sampling_rate

View File

@ -1604,7 +1604,7 @@ class UdopModel(UdopPreTrainedModel):
>>> # load an example image, along with the words and coordinates
>>> # which were extracted using an OCR engine
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
>>> example = dataset[0]
>>> image = example["image"]
>>> words = example["tokens"]
@ -1813,7 +1813,7 @@ class UdopForConditionalGeneration(UdopPreTrainedModel, GenerationMixin):
>>> # load an example image, along with the words and coordinates
>>> # which were extracted using an OCR engine
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
>>> example = dataset[0]
>>> image = example["image"]
>>> words = example["tokens"]
@ -2025,7 +2025,7 @@ class UdopEncoderModel(UdopPreTrainedModel):
>>> # load an example image, along with the words and coordinates
>>> # which were extracted using an OCR engine
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
>>> example = dataset[0]
>>> image = example["image"]
>>> words = example["tokens"]

View File

@ -590,7 +590,7 @@ class Wav2Vec2CTCTokenizer(PreTrainedTokenizer):
>>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")
>>> # load first sample of English common_voice
>>> dataset = load_dataset("mozilla-foundation/common_voice_11_0", "en", split="train", streaming=True, trust_remote_code=True)
>>> dataset = load_dataset("mozilla-foundation/common_voice_11_0", "en", split="train", streaming=True)
>>> dataset = dataset.cast_column("audio", datasets.Audio(sampling_rate=16_000))
>>> dataset_iter = iter(dataset)
>>> sample = next(dataset_iter)

View File

@ -546,7 +546,7 @@ class Wav2Vec2ProcessorWithLM(ProcessorMixin):
>>> processor = AutoProcessor.from_pretrained("patrickvonplaten/wav2vec2-base-100h-with-lm")
>>> # load first sample of English common_voice
>>> dataset = load_dataset("mozilla-foundation/common_voice_11_0", "en", split="train", streaming=True, trust_remote_code=True)
>>> dataset = load_dataset("mozilla-foundation/common_voice_11_0", "en", split="train", streaming=True)
>>> dataset = dataset.cast_column("audio", datasets.Audio(sampling_rate=16_000))
>>> dataset_iter = iter(dataset)
>>> sample = next(dataset_iter)

View File

@ -1670,7 +1670,7 @@ FLAX_WHISPER_AUDIO_CLASSIFICATION_DOCSTRING = r"""
>>> model = FlaxWhisperForAudioClassification.from_pretrained(
... "sanchit-gandhi/whisper-medium-fleurs-lang-id", from_pt=True
... )
>>> ds = load_dataset("google/fleurs", "all", split="validation", streaming=True, trust_remote_code=True)
>>> ds = load_dataset("google/fleurs", "all", split="validation", streaming=True)
>>> sample = next(iter(ds))

View File

@ -526,12 +526,15 @@ class Trainer:
if is_liger_kernel_available():
from liger_kernel.transformers import _apply_liger_kernel_to_instance
# Prepare kernel config - use provided config or default (empty dict for default behavior)
kernel_config = self.args.liger_kernel_config if self.args.liger_kernel_config is not None else {}
if isinstance(model, PreTrainedModel):
# Patch the model with liger kernels. Use the default kernel configurations.
_apply_liger_kernel_to_instance(model=model)
# Patch the model with liger kernels. Use the specified or default kernel configurations.
_apply_liger_kernel_to_instance(model=model, **kernel_config)
elif hasattr(model, "get_base_model") and isinstance(model.get_base_model(), PreTrainedModel):
# Patch the base model with liger kernels where model is a PeftModel. Use the default kernel configurations.
_apply_liger_kernel_to_instance(model=model.get_base_model())
# Patch the base model with liger kernels where model is a PeftModel. Use the specified or default kernel configurations.
_apply_liger_kernel_to_instance(model=model.get_base_model(), **kernel_config)
else:
logger.warning(
"The model is not an instance of PreTrainedModel. No liger kernels will be applied."

View File

@ -793,6 +793,11 @@ class TrainingArguments:
It can effectively increase multi-GPU training throughput by ~20% and reduces memory usage by ~60%, works out of the box with
flash attention, PyTorch FSDP, and Microsoft DeepSpeed. Currently, it supports llama, mistral, mixtral and gemma models.
liger_kernel_config (`Optional[dict]`, *optional*):
Configuration to be used for Liger Kernel. When use_liger_kernel=True, this dict is passed as keyword arguments to the
`_apply_liger_kernel_to_instance` function, which specifies which kernels to apply. Available options vary by model but typically
include: 'rope', 'swiglu', 'cross_entropy', 'fused_linear_cross_entropy', 'rms_norm', etc. If `None`, use the default kernel configurations.
average_tokens_across_devices (`bool`, *optional*, defaults to `False`):
Whether or not to average tokens across devices. If enabled, will use all_reduce to synchronize
num_tokens_in_batch for precise loss calculation. Reference:
@ -1525,6 +1530,19 @@ class TrainingArguments:
metadata={"help": "Whether or not to enable the Liger Kernel for model training."},
)
liger_kernel_config: Optional[dict[str, bool]] = field(
default=None,
metadata={
"help": (
"Configuration to be used for Liger Kernel. When use_liger_kernel=True, "
"this dict is passed as keyword arguments to the `_apply_liger_kernel_to_instance` function, "
"which specifies which kernels to apply. Available options vary by model "
"but typically include: 'rope', 'swiglu', 'cross_entropy', 'fused_linear_cross_entropy', "
"'rms_norm', etc. If None, use the default kernel configurations."
)
},
)
eval_use_gather_object: Optional[bool] = field(
default=False,
metadata={

View File
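For illustration, a hedged end-to-end sketch of the new argument, mirroring the integration test further down (the output dir is a placeholder, and `model`/`train_dataset` are assumed to be defined; kernels not listed in the dict keep Liger's defaults):

```python
from transformers import Trainer, TrainingArguments

args = TrainingArguments(
    output_dir="out",  # placeholder
    use_liger_kernel=True,
    # Selectively toggle kernels; unlisted keys fall back to the default configuration.
    liger_kernel_config={"rms_norm": False, "cross_entropy": True, "fused_linear_cross_entropy": False},
)
trainer = Trainer(model=model, args=args, train_dataset=train_dataset)  # model/train_dataset assumed defined
```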

@ -423,7 +423,7 @@ PT_SPEECH_BASE_MODEL_SAMPLE = r"""
>>> import torch
>>> from datasets import load_dataset
>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation", trust_remote_code=True)
>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
>>> dataset = dataset.sort("id")
>>> sampling_rate = dataset.features["audio"].sampling_rate
@ -449,7 +449,7 @@ PT_SPEECH_CTC_SAMPLE = r"""
>>> from datasets import load_dataset
>>> import torch
>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation", trust_remote_code=True)
>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
>>> dataset = dataset.sort("id")
>>> sampling_rate = dataset.features["audio"].sampling_rate
@ -484,7 +484,7 @@ PT_SPEECH_SEQ_CLASS_SAMPLE = r"""
>>> from datasets import load_dataset
>>> import torch
>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation", trust_remote_code=True)
>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
>>> dataset = dataset.sort("id")
>>> sampling_rate = dataset.features["audio"].sampling_rate
@ -520,7 +520,7 @@ PT_SPEECH_FRAME_CLASS_SAMPLE = r"""
>>> from datasets import load_dataset
>>> import torch
>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation", trust_remote_code=True)
>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
>>> dataset = dataset.sort("id")
>>> sampling_rate = dataset.features["audio"].sampling_rate
@ -549,7 +549,7 @@ PT_SPEECH_XVECTOR_SAMPLE = r"""
>>> from datasets import load_dataset
>>> import torch
>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation", trust_remote_code=True)
>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
>>> dataset = dataset.sort("id")
>>> sampling_rate = dataset.features["audio"].sampling_rate
@ -584,7 +584,7 @@ PT_VISION_BASE_MODEL_SAMPLE = r"""
>>> import torch
>>> from datasets import load_dataset
>>> dataset = load_dataset("huggingface/cats-image", trust_remote_code=True)
>>> dataset = load_dataset("huggingface/cats-image")
>>> image = dataset["test"]["image"][0]
>>> image_processor = AutoImageProcessor.from_pretrained("{checkpoint}")
@ -609,7 +609,7 @@ PT_VISION_SEQ_CLASS_SAMPLE = r"""
>>> import torch
>>> from datasets import load_dataset
>>> dataset = load_dataset("huggingface/cats-image", trust_remote_code=True)
>>> dataset = load_dataset("huggingface/cats-image")
>>> image = dataset["test"]["image"][0]
>>> image_processor = AutoImageProcessor.from_pretrained("{checkpoint}")
@ -1194,7 +1194,7 @@ TF_SPEECH_BASE_MODEL_SAMPLE = r"""
>>> from transformers import AutoProcessor, {model_class}
>>> from datasets import load_dataset
>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation", trust_remote_code=True)
>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
>>> dataset = dataset.sort("id")
>>> sampling_rate = dataset.features["audio"].sampling_rate
@ -1219,7 +1219,7 @@ TF_SPEECH_CTC_SAMPLE = r"""
>>> from datasets import load_dataset
>>> import tensorflow as tf
>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation", trust_remote_code=True)
>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
>>> dataset = dataset.sort("id")
>>> sampling_rate = dataset.features["audio"].sampling_rate
@ -1254,7 +1254,7 @@ TF_VISION_BASE_MODEL_SAMPLE = r"""
>>> from transformers import AutoImageProcessor, {model_class}
>>> from datasets import load_dataset
>>> dataset = load_dataset("huggingface/cats-image", trust_remote_code=True)
>>> dataset = load_dataset("huggingface/cats-image")
>>> image = dataset["test"]["image"][0]
>>> image_processor = AutoImageProcessor.from_pretrained("{checkpoint}")
@ -1277,7 +1277,7 @@ TF_VISION_SEQ_CLASS_SAMPLE = r"""
>>> import tensorflow as tf
>>> from datasets import load_dataset
>>> dataset = load_dataset("huggingface/cats-image", trust_remote_code=True)
>>> dataset = load_dataset("huggingface/cats-image")
>>> image = dataset["test"]["image"][0]
>>> image_processor = AutoImageProcessor.from_pretrained("{checkpoint}")

View File

@ -269,7 +269,6 @@ def make_task_cmds():
"img_clas": f"""
{scripts_dir}/image-classification/run_image_classification.py
--dataset_name hf-internal-testing/cats_vs_dogs_sample
--trust_remote_code
--remove_unused_columns False
--max_steps 10
--image_processor_name {DS_TESTS_DIRECTORY}/vit_feature_extractor.json

View File

@ -27,8 +27,6 @@ if is_torch_available():
import torch
if is_vision_available():
from PIL import Image
from transformers import BeitImageProcessor
if is_torchvision_available():
@ -98,23 +96,14 @@ class BeitImageProcessingTester:
def prepare_semantic_single_inputs():
dataset = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
image = Image.open(dataset[0]["file"])
map = Image.open(dataset[1]["file"])
return image, map
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
example = ds[0]
return example["image"], example["map"]
def prepare_semantic_batch_inputs():
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
image1 = Image.open(ds[0]["file"])
map1 = Image.open(ds[1]["file"])
image2 = Image.open(ds[2]["file"])
map2 = Image.open(ds[3]["file"])
return [image1, image2], [map1, map2]
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
return list(ds["image"][:2]), list(ds["map"][:2])
@require_torch

View File
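The recurring change in this and the following test files: the rewritten fixture datasets expose decoded PIL images directly under `image`/`map` columns, so the tests no longer need `trust_remote_code` or manual `Image.open(ds[i]["file"])` calls. A minimal sketch of the new access pattern (assuming the fixture layout shown above):

```python
from datasets import load_dataset

ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
image, seg_map = ds[0]["image"], ds[0]["map"]  # already-decoded PIL images
images, maps = list(ds["image"][:2]), list(ds["map"][:2])  # batched variant
```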

@ -504,7 +504,7 @@ class BeitModelIntegrationTest(unittest.TestCase):
image_processor = BeitImageProcessor(do_resize=True, size=640, do_center_crop=False)
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
image = Image.open(ds[0]["file"])
inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
@ -547,7 +547,7 @@ class BeitModelIntegrationTest(unittest.TestCase):
image_processor = BeitImageProcessor(do_resize=True, size=640, do_center_crop=False)
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
image = Image.open(ds[0]["file"])
inputs = image_processor(images=image, return_tensors="pt").to(torch_device)

View File

@ -669,7 +669,7 @@ class Data2VecAudioModelIntegrationTest(unittest.TestCase):
return [x["array"] for x in speech_samples]
def _load_superb(self, task, num_samples):
ds = load_dataset("anton-l/superb_dummy", task, split="test", trust_remote_code=True)
ds = load_dataset("anton-l/superb_dummy", task, split="test")
return ds[:num_samples]

View File

@ -29,8 +29,6 @@ if is_torch_available():
import torch
if is_vision_available():
from PIL import Image
from transformers import DPTImageProcessor
if is_torchvision_available():
@ -94,24 +92,15 @@ class DPTImageProcessingTester:
# Copied from transformers.tests.models.beit.test_image_processing_beit.prepare_semantic_single_inputs
def prepare_semantic_single_inputs():
dataset = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
image = Image.open(dataset[0]["file"])
map = Image.open(dataset[1]["file"])
return image, map
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
example = ds[0]
return example["image"], example["map"]
# Copied from transformers.tests.models.beit.test_image_processing_beit.prepare_semantic_batch_inputs
def prepare_semantic_batch_inputs():
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
image1 = Image.open(ds[0]["file"])
map1 = Image.open(ds[1]["file"])
image2 = Image.open(ds[2]["file"])
map2 = Image.open(ds[3]["file"])
return [image1, image2], [map1, map2]
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
return list(ds["image"][:2]), list(ds["map"][:2])
@require_torch

View File

@ -767,7 +767,7 @@ class HubertModelIntegrationTest(unittest.TestCase):
def _load_superb(self, task, num_samples):
from datasets import load_dataset
ds = load_dataset("anton-l/superb_dummy", task, split="test", trust_remote_code=True)
ds = load_dataset("anton-l/superb_dummy", task, split="test")
return ds[:num_samples]

View File

@ -111,13 +111,13 @@ class LayoutLMv2ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase)
def test_layoutlmv2_integration_test(self):
from datasets import load_dataset
ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test", trust_remote_code=True)
ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test")
for image_processing_class in self.image_processor_list:
# with apply_OCR = True
image_processing = image_processing_class()
image = Image.open(ds[0]["file"]).convert("RGB")
image = ds[0]["image"]
encoding = image_processing(image, return_tensors="pt")

View File

@ -156,7 +156,7 @@ class LayoutLMv2ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
from datasets import load_dataset
# set up
datasets = load_dataset("nielsr/funsd", trust_remote_code=True)
datasets = load_dataset("nielsr/funsd")
processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased", revision="no_ocr")
def preprocess_data(examples):
@ -192,12 +192,8 @@ class LayoutLMv2ProcessorIntegrationTests(unittest.TestCase):
# we verify our implementation on 2 document images from the DocVQA dataset
from datasets import load_dataset
ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test", trust_remote_code=True)
image_1 = Image.open(ds[0]["file"]).convert("RGB")
image_2 = Image.open(ds[1]["file"]).convert("RGB")
return image_1, image_2
ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test")
return ds[0]["image"], ds[1]["image"]
@cached_property
def get_tokenizers(self):

View File

@ -22,8 +22,6 @@ from ...test_image_processing_common import ImageProcessingTestMixin, prepare_im
if is_pytesseract_available():
from PIL import Image
from transformers import LayoutLMv3ImageProcessor
if is_torchvision_available():
@ -106,13 +104,13 @@ class LayoutLMv3ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase)
def test_LayoutLMv3_integration_test(self):
from datasets import load_dataset
ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test", trust_remote_code=True)
ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test")
# with apply_OCR = True
for image_processing_class in self.image_processor_list:
image_processor = image_processing_class()
image = Image.open(ds[0]["file"]).convert("RGB")
image = ds[0]["image"].convert("RGB")
encoding = image_processor(image, return_tensors="pt")

View File

@ -28,8 +28,6 @@ from ...test_processing_common import ProcessorTesterMixin
if is_pytesseract_available():
from PIL import Image
from transformers import LayoutLMv3ImageProcessor
@ -172,12 +170,8 @@ class LayoutLMv3ProcessorIntegrationTests(unittest.TestCase):
# we verify our implementation on 2 document images from the DocVQA dataset
from datasets import load_dataset
ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test", trust_remote_code=True)
image_1 = Image.open(ds[0]["file"]).convert("RGB")
image_2 = Image.open(ds[1]["file"]).convert("RGB")
return image_1, image_2
ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test")
return ds[0]["image"], ds[1]["image"]
@cached_property
def get_tokenizers(self):

View File

@ -162,7 +162,7 @@ class LayoutXLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
from datasets import load_dataset
# set up
datasets = load_dataset("nielsr/funsd", trust_remote_code=True)
datasets = load_dataset("nielsr/funsd")
processor = LayoutXLMProcessor.from_pretrained("microsoft/layoutxlm-base", apply_ocr=False)
def preprocess_data(examples):
@ -200,12 +200,8 @@ class LayoutXLMProcessorIntegrationTests(unittest.TestCase):
# we verify our implementation on 2 document images from the DocVQA dataset
from datasets import load_dataset
ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test", trust_remote_code=True)
image_1 = Image.open(ds[0]["file"]).convert("RGB")
image_2 = Image.open(ds[1]["file"]).convert("RGB")
return image_1, image_2
ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test")
return ds[0]["image"], ds[1]["image"]
@cached_property
def get_tokenizers(self):

View File

@ -27,8 +27,6 @@ if is_torch_available():
import torch
if is_vision_available():
from PIL import Image
from transformers import MobileViTImageProcessor
@ -86,23 +84,14 @@ class MobileViTImageProcessingTester:
def prepare_semantic_single_inputs():
dataset = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
image = Image.open(dataset[0]["file"])
map = Image.open(dataset[1]["file"])
return image, map
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
example = ds[0]
return example["image"], example["map"]
def prepare_semantic_batch_inputs():
dataset = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
image1 = Image.open(dataset[0]["file"])
map1 = Image.open(dataset[1]["file"])
image2 = Image.open(dataset[2]["file"])
map2 = Image.open(dataset[3]["file"])
return [image1, image2], [map1, map2]
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
return list(ds["image"][:2]), list(ds["map"][:2])
@require_torch

View File

@ -86,8 +86,12 @@ class NougatImageProcessingTester:
return self.num_channels, self.size["height"], self.size["width"]
def prepare_dummy_image(self):
revision = "ec57bf8c8b1653a209c13f6e9ee66b12df0fc2db"
filepath = hf_hub_download(
repo_id="hf-internal-testing/fixtures_docvqa", filename="nougat_pdf.png", repo_type="dataset"
repo_id="hf-internal-testing/fixtures_docvqa",
filename="nougat_pdf.png",
repo_type="dataset",
revision=revision,
)
image = Image.open(filepath).convert("RGB")
return image
@ -179,8 +183,12 @@ class NougatImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
self.assertEqual((3, 100, 200), aligned_image.shape)
def prepare_dummy_np_image(self):
revision = "ec57bf8c8b1653a209c13f6e9ee66b12df0fc2db"
filepath = hf_hub_download(
repo_id="hf-internal-testing/fixtures_docvqa", filename="nougat_pdf.png", repo_type="dataset"
repo_id="hf-internal-testing/fixtures_docvqa",
filename="nougat_pdf.png",
repo_type="dataset",
revision=revision,
)
image = Image.open(filepath).convert("RGB")
return np.array(image)

View File

@ -842,11 +842,8 @@ def prepare_img():
# Helper functions for optical flow integration test
def prepare_optical_flow_images():
dataset = load_dataset("hf-internal-testing/fixtures_sintel", split="test", trust_remote_code=True)
image1 = Image.open(dataset[0]["file"]).convert("RGB")
image2 = Image.open(dataset[0]["file"]).convert("RGB")
return image1, image2
ds = load_dataset("hf-internal-testing/fixtures_sintel", split="test")
return list(ds["image"][:2])
def normalize(img):

View File

@ -27,8 +27,6 @@ if is_torch_available():
import torch
if is_vision_available():
from PIL import Image
from transformers import SegformerImageProcessor
@ -86,23 +84,14 @@ class SegformerImageProcessingTester:
def prepare_semantic_single_inputs():
dataset = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
image = Image.open(dataset[0]["file"])
map = Image.open(dataset[1]["file"])
return image, map
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
example = ds[0]
return example["image"], example["map"]
def prepare_semantic_batch_inputs():
dataset = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
image1 = Image.open(dataset[0]["file"])
map1 = Image.open(dataset[1]["file"])
image2 = Image.open(dataset[2]["file"])
map2 = Image.open(dataset[3]["file"])
return [image1, image2], [map1, map2]
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
return list(ds["image"][:2]), list(ds["map"][:2])
@require_torch

View File

@ -184,7 +184,7 @@ class UdopProcessorTest(ProcessorTesterMixin, unittest.TestCase):
from datasets import load_dataset
# set up
datasets = load_dataset("nielsr/funsd", trust_remote_code=True)
datasets = load_dataset("nielsr/funsd")
processor = UdopProcessor.from_pretrained("microsoft/udop-large", apply_ocr=False)
def preprocess_data(examples):
@ -222,12 +222,8 @@ class UdopProcessorIntegrationTests(unittest.TestCase):
# we verify our implementation on 2 document images from the DocVQA dataset
from datasets import load_dataset
ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test", trust_remote_code=True)
image_1 = Image.open(ds[0]["file"]).convert("RGB")
image_2 = Image.open(ds[1]["file"]).convert("RGB")
return image_1, image_2
ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test")
return ds[0]["image"], ds[1]["image"]
@cached_property
def get_tokenizers(self):

View File

@ -566,7 +566,7 @@ class UniSpeechModelIntegrationTest(unittest.TestCase):
return [x["array"] for x in speech_samples]
def _load_superb(self, task, num_samples):
ds = load_dataset("anton-l/superb_dummy", task, split="test", trust_remote_code=True)
ds = load_dataset("anton-l/superb_dummy", task, split="test")
return ds[:num_samples]

View File

@ -820,7 +820,7 @@ class UniSpeechSatModelIntegrationTest(unittest.TestCase):
return [x["array"] for x in speech_samples]
def _load_superb(self, task, num_samples):
ds = load_dataset("anton-l/superb_dummy", task, split="test", trust_remote_code=True)
ds = load_dataset("anton-l/superb_dummy", task, split="test")
return ds[:num_samples]

View File

@ -637,9 +637,9 @@ class ViltModelIntegrationTest(unittest.TestCase):
processor = self.default_processor
dataset = load_dataset("hf-internal-testing/fixtures_nlvr2", split="test", trust_remote_code=True)
image1 = Image.open(dataset[0]["file"]).convert("RGB")
image2 = Image.open(dataset[1]["file"]).convert("RGB")
dataset = load_dataset("hf-internal-testing/fixtures_nlvr2", split="train")
image1 = dataset[0]["image"]
image2 = dataset[1]["image"]
text = (
"The left image contains twice the number of dogs as the right image, and at least two dogs in total are"

View File

@ -1149,8 +1149,8 @@ class TrOCRModelIntegrationTest(unittest.TestCase):
def test_inference_handwritten(self):
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten").to(torch_device)
dataset = load_dataset("hf-internal-testing/fixtures_ocr", split="test", trust_remote_code=True)
image = Image.open(dataset[0]["file"]).convert("RGB")
dataset = load_dataset("hf-internal-testing/fixtures_ocr", split="train")
image = dataset[0]["image"]
processor = self.default_processor
pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(torch_device)
@ -1174,8 +1174,8 @@ class TrOCRModelIntegrationTest(unittest.TestCase):
def test_inference_printed(self):
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-printed").to(torch_device)
dataset = load_dataset("hf-internal-testing/fixtures_ocr", split="test", trust_remote_code=True)
image = Image.open(dataset[1]["file"]).convert("RGB")
dataset = load_dataset("hf-internal-testing/fixtures_ocr", split="test")
image = dataset[0]["image"]
processor = self.default_processor
pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(torch_device)

View File

@ -97,9 +97,7 @@ def _test_wav2vec2_with_lm_invalid_pool(in_queue, out_queue, timeout):
try:
_ = in_queue.get(timeout=timeout)
ds = load_dataset(
"mozilla-foundation/common_voice_11_0", "es", split="test", streaming=True, trust_remote_code=True
)
ds = load_dataset("mozilla-foundation/common_voice_11_0", "es", split="test", streaming=True)
sample = next(iter(ds))
resampled_audio = torchaudio.functional.resample(
@ -1470,7 +1468,7 @@ class Wav2Vec2ModelIntegrationTest(unittest.TestCase):
return [x["array"] for x in speech_samples]
def _load_superb(self, task, num_samples):
ds = load_dataset("anton-l/superb_dummy", task, split="test", trust_remote_code=True)
ds = load_dataset("anton-l/superb_dummy", task, split="test")
return ds[:num_samples]
@ -1836,9 +1834,7 @@ class Wav2Vec2ModelIntegrationTest(unittest.TestCase):
@require_pyctcdecode
@require_torchaudio
def test_wav2vec2_with_lm(self):
ds = load_dataset(
"mozilla-foundation/common_voice_11_0", "es", split="test", streaming=True, trust_remote_code=True
)
ds = load_dataset("mozilla-foundation/common_voice_11_0", "es", split="test", streaming=True)
sample = next(iter(ds))
resampled_audio = torchaudio.functional.resample(
@ -1862,9 +1858,7 @@ class Wav2Vec2ModelIntegrationTest(unittest.TestCase):
@require_pyctcdecode
@require_torchaudio
def test_wav2vec2_with_lm_pool(self):
ds = load_dataset(
"mozilla-foundation/common_voice_11_0", "es", split="test", streaming=True, trust_remote_code=True
)
ds = load_dataset("mozilla-foundation/common_voice_11_0", "es", split="test", streaming=True)
sample = next(iter(ds))
resampled_audio = torchaudio.functional.resample(
@ -1963,9 +1957,7 @@ class Wav2Vec2ModelIntegrationTest(unittest.TestCase):
LANG_MAP = {"it": "ita", "es": "spa", "fr": "fra", "en": "eng"}
def run_model(lang):
ds = load_dataset(
"mozilla-foundation/common_voice_11_0", lang, split="test", streaming=True, trust_remote_code=True
)
ds = load_dataset("mozilla-foundation/common_voice_11_0", lang, split="test", streaming=True)
sample = next(iter(ds))
wav2vec2_lang = LANG_MAP[lang]

View File

@ -463,9 +463,7 @@ class Wav2Vec2ProcessorWithLMTest(unittest.TestCase):
def test_word_time_stamp_integration(self):
import torch
ds = load_dataset(
"mozilla-foundation/common_voice_11_0", "en", split="train", streaming=True, trust_remote_code=True
)
ds = load_dataset("mozilla-foundation/common_voice_11_0", "en", split="train", streaming=True)
ds = ds.cast_column("audio", datasets.Audio(sampling_rate=16_000))
ds_iter = iter(ds)
sample = next(ds_iter)

View File

@ -473,7 +473,7 @@ class WavLMModelIntegrationTest(unittest.TestCase):
return [x["array"] for x in speech_samples]
def _load_superb(self, task, num_samples):
ds = load_dataset("anton-l/superb_dummy", task, split="test", trust_remote_code=True)
ds = load_dataset("anton-l/superb_dummy", task, split="test")
return ds[:num_samples]

View File

@ -1645,9 +1645,7 @@ class WhisperModelIntegrationTests(unittest.TestCase):
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v3")
model.to(torch_device)
ds = load_dataset(
"facebook/multilingual_librispeech", "german", split="test", streaming=True, trust_remote_code=True
)
ds = load_dataset("facebook/multilingual_librispeech", "german", split="test", streaming=True)
ds = ds.cast_column("audio", datasets.Audio(sampling_rate=16_000))
input_speech = next(iter(ds))["audio"]["array"]
@ -1714,11 +1712,10 @@ class WhisperModelIntegrationTests(unittest.TestCase):
token = os.getenv("HF_HUB_READ_TOKEN", True)
ds = load_dataset(
"mozilla-foundation/common_voice_6_1",
"hf-internal-testing/fixtures_common_voice",
"ja",
split="test",
streaming=True,
trust_remote_code=True,
token=token,
)
ds = ds.cast_column("audio", datasets.Audio(sampling_rate=16_000))

View File

@ -179,7 +179,7 @@ class AudioClassificationPipelineTests(unittest.TestCase):
model = "superb/wav2vec2-base-superb-ks"
audio_classifier = pipeline("audio-classification", model=model)
dataset = datasets.load_dataset("anton-l/superb_dummy", "ks", split="test", trust_remote_code=True)
dataset = datasets.load_dataset("anton-l/superb_dummy", "ks", split="test")
audio = np.array(dataset[3]["speech"], dtype=np.float32)
output = audio_classifier(audio, top_k=4)

View File

@ -265,9 +265,7 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
@require_torch
@require_pyctcdecode
def test_large_model_pt_with_lm(self):
dataset = load_dataset("Narsil/asr_dummy", streaming=True, trust_remote_code=True)
third_item = next(iter(dataset["test"].skip(3)))
filename = third_item["file"]
filename = hf_hub_download("Narsil/asr_dummy", filename="4.flac", repo_type="dataset")
speech_recognizer = pipeline(
task="automatic-speech-recognition",
@ -388,7 +386,7 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
chunk_length_s=8,
stride_length_s=1,
)
data = load_dataset("openslr/librispeech_asr", "clean", split="test", streaming=True, trust_remote_code=True)
data = load_dataset("openslr/librispeech_asr", "clean", split="test", streaming=True)
sample = next(iter(data))
res = pipe(sample["audio"]["array"])
@ -434,7 +432,7 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
stride_length_s=1,
return_language=True,
)
data = load_dataset("openslr/librispeech_asr", "clean", split="test", streaming=True, trust_remote_code=True)
data = load_dataset("openslr/librispeech_asr", "clean", split="test", streaming=True)
sample = next(iter(data))
res = pipe(sample["audio"]["array"])
@ -489,7 +487,7 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
task="automatic-speech-recognition",
model="openai/whisper-tiny.en",
)
data = load_dataset("openslr/librispeech_asr", "clean", split="test", streaming=True, trust_remote_code=True)
data = load_dataset("openslr/librispeech_asr", "clean", split="test", streaming=True)
samples = [next(iter(data)) for _ in range(8)]
audio = np.concatenate([sample["audio"]["array"] for sample in samples])
@ -1125,9 +1123,7 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
@slow
def test_speculative_decoding_whisper_non_distil(self):
# Load data:
dataset = load_dataset(
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:1]", trust_remote_code=True
)
dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:1]")
sample = dataset[0]["audio"]
# Load model:
@ -1169,9 +1165,7 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
@slow
def test_speculative_decoding_whisper_distil(self):
# Load data:
dataset = load_dataset(
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:1]", trust_remote_code=True
)
dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:1]")
sample = dataset[0]["audio"]
# Load model:

View File

@ -601,7 +601,7 @@ class ImageSegmentationPipelineTests(unittest.TestCase):
image_segmenter = pipeline("image-segmentation", model=model, image_processor=image_processor)
image = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
image = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
file = image[0]["file"]
outputs = image_segmenter(file, threshold=threshold)
@ -655,7 +655,7 @@ class ImageSegmentationPipelineTests(unittest.TestCase):
def test_oneformer(self):
image_segmenter = pipeline(model="shi-labs/oneformer_ade20k_swin_tiny")
image = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
image = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
file = image[0]["file"]
outputs = image_segmenter(file, threshold=0.99)
# Shortening by hashing

View File

@ -3799,8 +3799,20 @@ class ModelTesterMixin:
self.skipTest(reason="Idefics currently (transformers==4.39.1) requires an image_attention_mask input")
if config.model_type in ["sam"]:
self.skipTest(reason="SAM requires an attention_mask input for relative positional embeddings")
model = model_class(config)
sub_models_supporting_sdpa = [
module._supports_sdpa
for name, module in model.named_modules()
if isinstance(module, PreTrainedModel) and name != ""
]
supports_sdpa_all_modules = (
all(sub_models_supporting_sdpa) if len(sub_models_supporting_sdpa) > 0 else model._supports_sdpa
)
if not supports_sdpa_all_modules:
self.skipTest(reason="This models' submodels does not support sdpa")
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, attn_implementation="sdpa")
@ -3848,8 +3860,20 @@ class ModelTesterMixin:
"Cannot compile forward without an existing cache with Hybrid, as `torch._dynamo.mark_static_address` "
"is a forbidden call."
)
model = model_class(config)
sub_models_supporting_sdpa = [
module._supports_sdpa
for name, module in model.named_modules()
if isinstance(module, PreTrainedModel) and name != ""
]
supports_sdpa_all_modules = (
all(sub_models_supporting_sdpa) if len(sub_models_supporting_sdpa) > 0 else model._supports_sdpa
)
if not supports_sdpa_all_modules:
self.skipTest(reason="This models' submodels does not support sdpa")
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, attn_implementation="sdpa")

View File
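The two hunks above add the same submodule scan in two places; as a reading aid, here is a hypothetical helper with identical logic (not part of the change):

```python
from transformers import PreTrainedModel

def supports_sdpa_everywhere(model: PreTrainedModel) -> bool:
    # SDPA can be dispatched only if every nested PreTrainedModel advertises support.
    flags = [
        module._supports_sdpa
        for name, module in model.named_modules()
        if isinstance(module, PreTrainedModel) and name != ""  # name == "" is the root model itself
    ]
    # No nested PreTrainedModels: fall back to the top-level flag.
    return all(flags) if flags else model._supports_sdpa
```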

@ -1792,6 +1792,25 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
self.assertEqual(modeling_llama.apply_rotary_pos_emb, liger_rotary_pos_emb)
self.assertTrue(isinstance(tiny_llama.model.norm, LigerRMSNorm))
@require_liger_kernel
def test_use_liger_kernel_custom_config_patching(self):
# Ensure any monkey patching is cleaned up for subsequent tests
with patch("transformers.models.llama.modeling_llama"):
from liger_kernel.transformers import LigerRMSNorm
config = LlamaConfig(vocab_size=100, hidden_size=32, num_hidden_layers=3, num_attention_heads=4)
tiny_llama = LlamaForCausalLM(config)
args = TrainingArguments(
self.get_auto_remove_tmp_dir(),
use_liger_kernel=True,
liger_kernel_config={"rms_norm": False}, # Don't apply Liger's RMSNorm
)
Trainer(tiny_llama, args)
# Check that the RMSNorm kernel is not applied as specified in the config
self.assertFalse(isinstance(tiny_llama.model.norm, LigerRMSNorm))
@require_liger_kernel
@require_torch_accelerator
def test_use_liger_kernel_trainer(self):
@ -1810,6 +1829,29 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
# Check this works
_ = trainer.train()
@require_liger_kernel
@require_torch_accelerator
def test_use_liger_kernel_custom_config_trainer(self):
# Check that trainer still works with liger kernel applied when using a custom config
config = LlamaConfig(vocab_size=100, hidden_size=32, num_hidden_layers=3, num_attention_heads=4)
tiny_llama = LlamaForCausalLM(config)
x = torch.randint(0, 100, (128,))
train_dataset = RepeatDataset(x)
args = TrainingArguments(
self.get_auto_remove_tmp_dir(),
learning_rate=1e-2,
logging_steps=5,
max_steps=20,
use_liger_kernel=True,
liger_kernel_config={"rms_norm": False, "cross_entropy": True, "fused_linear_cross_entropy": False},
)
trainer = Trainer(tiny_llama, args, train_dataset=train_dataset)
# Check this works
_ = trainer.train()
@require_lomo
@require_torch_accelerator
def test_lomo(self):

View File

@ -133,10 +133,19 @@ if __name__ == "__main__":
# Assuming there is a topological sort on the dependency mapping: if the file being checked and its dependencies
# are not in the diff, then it is guaranteed to have no differences. If no models are in the diff, then this
# script will do nothing.
models_in_diff = get_models_in_diff()
if not models_in_diff:
console.print("[bold green]No models files or model tests in the diff, skipping modular checks[/bold green]")
exit(0)
current_branch = subprocess.check_output(["git", "branch", "--show-current"], text=True).strip()
if current_branch == "main":
console.print(
"[bold red]You are developing on the main branch. We cannot identify the list of changed files and will have to check all files. This may take a while.[/bold red]"
)
models_in_diff = {file_path.split("/")[-2] for file_path in args.files}
else:
models_in_diff = get_models_in_diff()
if not models_in_diff:
console.print(
"[bold green]No models files or model tests in the diff, skipping modular checks[/bold green]"
)
exit(0)
skipped_models = set()
non_matching_files = 0
@ -149,7 +158,8 @@ if __name__ == "__main__":
skipped_models.add(model_name)
continue
non_matching_files += compare_files(modular_file_path, args.fix_and_overwrite)
models_in_diff = get_models_in_diff() # When overwriting, the diff changes
if current_branch != "main":
models_in_diff = get_models_in_diff() # When overwriting, the diff changes
else:
new_ordered_files = []
for modular_file_path in ordered_files:

View File
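On `main` there is no diff to inspect, so the script falls back to deriving model names directly from the file paths it was given, as in the added branch above. For example (hypothetical path):

```python
files = ["src/transformers/models/llama/modular_llama.py"]
models_in_diff = {file_path.split("/")[-2] for file_path in files}
print(models_in_diff)  # {'llama'}
```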

@ -1,59 +0,0 @@
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This script is used to get a map containing the information of runners to use in GitHub Actions workflow files.
This is meant to be a temporary file that helps us to switch progressively from T4 to A10 runners.
The data is stored in a Hub repository [hf-internal-testing/transformers_daily_ci](https://huggingface.co/datasets/hf-internal-testing/transformers_daily_ci/blob/main/runner_map.json).
Currently, in that file, we specify the models for which we want to run the tests with T4 runners, to avoid many test failures showing up in the CI reports.
We will work on the tests so that A10 runners can be used for all CI jobs.
"""
import os
import requests
if __name__ == "__main__":
# T4
t4_runners = {
"single-gpu": "aws-g4dn-4xlarge-cache",
"multi-gpu": "aws-g4dn-12xlarge-cache",
}
# A10
a10_runners = {
"single-gpu": "aws-g5-4xlarge-cache",
"multi-gpu": "aws-g5-12xlarge-cache",
}
tests = os.getcwd()
model_tests = os.listdir(os.path.join(tests, "models"))
d1 = sorted(filter(os.path.isdir, os.listdir(tests)))
d2 = sorted(filter(os.path.isdir, [f"models/{x}" for x in model_tests]))
d1.remove("models")
d = d2 + d1
response = requests.get("https://huggingface.co/datasets/hf-internal-testing/transformers_daily_ci/resolve/main/runner_map.json")
# The models that we want to run with T4 runners
runner_map = response.json()
for key in d:
if key in runner_map:
runner_map[key] = t4_runners
else:
runner_map[key] = a10_runners
print(runner_map)

View File

@ -1494,7 +1494,7 @@ if __name__ == "__main__":
other_ci_artifacts=other_ci_artifacts,
)
# # send report only if there is any failure (for push CI)
# if message.n_failures or (ci_event != "push" and not ci_event.startswith("Push CI (AMD)")):
# message.post()
# message.post_reply()
# send report only if there is any failure (for push CI)
if message.n_failures or (ci_event != "push" and not ci_event.startswith("Push CI (AMD)")):
message.post()
message.post_reply()

View File

@ -62,5 +62,4 @@ if __name__ == "__main__":
start = end
end = start + num_jobs_per_splits + (1 if idx < num_jobs % args.num_splits else 0)
model_splits.append(d[start:end])
model_splits = [['models/vit', 'generation'], ['models/clip', 'models/vits']]
print(model_splits)