Compare commits


22 Commits

SHA1 Message Date
666964a173 install ffmpeg and torchcodec in ci 2025-07-01 18:11:26 +02:00
06cf39c89e add torchcodec to torch tests requirements 2025-07-01 17:22:52 +02:00
edeb984e69 fix import 2025-06-27 15:59:32 +02:00
93ce42fd63 style 2025-06-26 18:16:29 +02:00
128660d913 soundfile -> torchcodec 2025-06-26 18:02:42 +02:00
5d118a11c0 test datasets@main 2025-06-25 17:19:01 +02:00
6cf0a520ee last one 2025-06-24 19:36:23 +02:00
3b99fd273f style 2025-06-24 16:51:28 +02:00
d48c569504 fix the last ones 2025-06-24 16:51:19 +02:00
52f2dbe9ef fix 2025-06-23 17:09:46 +02:00
f6c540db72 fix 2025-06-23 17:07:16 +02:00
f884fa5b1b style 2025-06-23 16:54:11 +02:00
9db4ddb1b9 fix tests 2025-06-23 16:53:40 +02:00
9b2afaf02d fix integration test 2025-06-20 19:54:27 +02:00
d188134b95 style 2025-06-20 17:43:26 +02:00
e2ed15c465 again 2025-06-20 17:42:17 +02:00
005459827e again 2025-06-20 14:44:38 +02:00
69419a4935 style 2025-06-20 14:37:26 +02:00
1fdb9f3908 again 2025-06-20 13:19:50 +02:00
3dfebf2fc0 Revert "Skip some tests for now (#38931)"
This reverts commit 31d30b72245aacfdf70249165964b53790d9c4d8.
2025-06-20 13:00:47 +02:00
e6093deb18 again 2025-06-20 13:00:25 +02:00
b7ec09c2f4 remove trust_remote_code 2025-06-20 13:00:25 +02:00
90 changed files with 381 additions and 450 deletions

View File

@ -156,6 +156,7 @@ class CircleCIJob:
{"attach_workspace": {"at": "test_preparation"}},
{"run": "apt-get update && apt-get install -y curl"},
{"run": " && ".join(self.install_steps)},
{"run": {"name": "Install `datasets@main`", "command": 'pip uninstall datasets -y && pip install "datasets @ git+https://github.com/huggingface/datasets@main#egg=datasets"'}},
{"run": {"name": "Download NLTK files", "command": """python -c "import nltk; nltk.download('punkt', quiet=True)" """} if "example" in self.name else "echo Skipping"},
{"run": {
"name": "Show installed libraries and their size",

View File

@ -2,10 +2,10 @@ FROM python:3.9-slim
ENV PYTHONDONTWRITEBYTECODE=1
ARG REF=main
USER root
RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git
RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git ffmpeg
ENV UV_PYTHON=/usr/local/bin/python
RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' 'torchcodec' --index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing]" seqeval albumentations jiwer
RUN uv pip uninstall transformers
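
The `ffmpeg` and `torchcodec` additions above go together, since torchcodec decodes through FFmpeg at runtime. A minimal smoke test one could run inside the built image (a sketch assuming a local `sample.wav`; not part of this diff):

```python
import torch
from torchcodec.decoders import AudioDecoder

# torchcodec relies on the ffmpeg system package installed in the RUN step above
samples = AudioDecoder("sample.wav").get_all_samples()
assert isinstance(samples.data, torch.Tensor)  # shape: (num_channels, num_samples)
print(samples.sample_rate, samples.data.shape)
```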

View File

@ -2,10 +2,10 @@ FROM python:3.9-slim
ENV PYTHONDONTWRITEBYTECODE=1
ARG REF=main
USER root
RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git pkg-config openssh-client git
RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git pkg-config openssh-client git ffmpeg
ENV UV_PYTHON=/usr/local/bin/python
RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' 'torchcodec' --index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing]"
RUN uv pip uninstall transformers

View File

@ -2,10 +2,10 @@ FROM python:3.9-slim
ENV PYTHONDONTWRITEBYTECODE=1
ARG REF=main
USER root
RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git git-lfs
RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git git-lfs ffmpeg
ENV UV_PYTHON=/usr/local/bin/python
RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' 'torchcodec' --index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing,tiktoken,num2words,video]"
RUN uv pip uninstall transformers

View File

@ -56,7 +56,7 @@ Here is how to use the processor to process text and audio:
```python
>>> # let's load an audio sample from an Arabic speech corpus
>>> from datasets import load_dataset
>>> dataset = load_dataset("arabic_speech_corpus", split="test", streaming=True, trust_remote_code=True)
>>> dataset = load_dataset("halabi2016/arabic_speech_corpus", split="test", streaming=True)
>>> audio_sample = next(iter(dataset))["audio"]
>>> # now, process it

View File

@ -56,7 +56,7 @@ Here is how to use the processor to process text and audio:
```python
>>> # let's load an audio sample from an Arabic speech corpus
>>> from datasets import load_dataset
>>> dataset = load_dataset("arabic_speech_corpus", split="test", streaming=True, trust_remote_code=True)
>>> dataset = load_dataset("halabi2016/arabic_speech_corpus", split="test", streaming=True)
>>> audio_sample = next(iter(dataset))["audio"]
>>> # now, process it

View File

@ -64,15 +64,15 @@ predicted token ids.
>>> import torch
>>> from transformers import Speech2Text2Processor, SpeechEncoderDecoderModel
>>> from datasets import load_dataset
>>> import soundfile as sf
>>> from torchcodec.decoders import AudioDecoder
>>> model = SpeechEncoderDecoderModel.from_pretrained("facebook/s2t-wav2vec2-large-en-de")
>>> processor = Speech2Text2Processor.from_pretrained("facebook/s2t-wav2vec2-large-en-de")
>>> def map_to_array(batch):
... speech, _ = sf.read(batch["file"])
... batch["speech"] = speech
... decoder = AudioDecoder(batch["file"])
... batch["speech"] = decoder.get_all_samples().data
... return batch
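
For reference, a self-contained sketch of the migrated pattern shown above (the dataset name and the 16 kHz mono assumption are illustrative; `get_all_samples().data` is a `(num_channels, num_samples)` tensor, so the first channel is taken):

```python
from datasets import load_dataset
from torchcodec.decoders import AudioDecoder
from transformers import Speech2Text2Processor, SpeechEncoderDecoderModel

model = SpeechEncoderDecoderModel.from_pretrained("facebook/s2t-wav2vec2-large-en-de")
processor = Speech2Text2Processor.from_pretrained("facebook/s2t-wav2vec2-large-en-de")

def map_to_array(batch):
    # decode the audio file referenced by the example, as in the diff above
    decoder = AudioDecoder(batch["file"])
    batch["speech"] = decoder.get_all_samples().data[0]  # first channel
    return batch

ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
ds = ds.map(map_to_array)

inputs = processor(ds[0]["speech"], sampling_rate=16_000, return_tensors="pt")
generated_ids = model.generate(inputs=inputs.input_values, attention_mask=inputs.attention_mask)
print(processor.batch_decode(generated_ids, skip_special_tokens=True))
```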

View File

@ -264,7 +264,6 @@ class ExamplesTests(TestCasePlus):
--dataset_config clean
--train_split_name validation
--eval_split_name validation
--trust_remote_code
--output_dir {tmp_dir}
--overwrite_output_dir
--num_train_epochs=2

View File

@ -22,6 +22,7 @@ protobuf
torch
torchvision
torchaudio
torchcodec
jiwer
librosa
evaluate >= 0.2.0

View File

@ -312,7 +312,6 @@ class ExamplesTestsNoTrainer(TestCasePlus):
{self.examples_dir}/pytorch/image-classification/run_image_classification_no_trainer.py
--model_name_or_path google/vit-base-patch16-224-in21k
--dataset_name hf-internal-testing/cats_vs_dogs_sample
--trust_remote_code
--learning_rate 1e-4
--per_device_train_batch_size 2
--per_device_eval_batch_size 1

View File

@ -17,7 +17,6 @@ import json
import logging
import os
import sys
import unittest
from unittest.mock import patch
from transformers import ViTMAEForPreTraining, Wav2Vec2ForPreTraining
@ -391,7 +390,6 @@ class ExamplesTests(TestCasePlus):
--output_dir {tmp_dir}
--model_name_or_path google/vit-base-patch16-224-in21k
--dataset_name hf-internal-testing/cats_vs_dogs_sample
--trust_remote_code
--do_train
--do_eval
--learning_rate 1e-4
@ -415,7 +413,6 @@ class ExamplesTests(TestCasePlus):
result = get_results(tmp_dir)
self.assertGreaterEqual(result["eval_accuracy"], 0.8)
@unittest.skip("temporary to avoid failing on circleci")
def test_run_speech_recognition_ctc(self):
tmp_dir = self.get_auto_remove_tmp_dir()
testargs = f"""
@ -426,7 +423,6 @@ class ExamplesTests(TestCasePlus):
--dataset_config_name clean
--train_split_name validation
--eval_split_name validation
--trust_remote_code
--do_train
--do_eval
--learning_rate 1e-4
@ -447,7 +443,6 @@ class ExamplesTests(TestCasePlus):
result = get_results(tmp_dir)
self.assertLess(result["eval_loss"], result["train_loss"])
@unittest.skip("temporary to avoid failing on circleci")
def test_run_speech_recognition_ctc_adapter(self):
tmp_dir = self.get_auto_remove_tmp_dir()
testargs = f"""
@ -458,7 +453,6 @@ class ExamplesTests(TestCasePlus):
--dataset_config_name clean
--train_split_name validation
--eval_split_name validation
--trust_remote_code
--do_train
--do_eval
--learning_rate 1e-4
@ -481,7 +475,6 @@ class ExamplesTests(TestCasePlus):
self.assertTrue(os.path.isfile(os.path.join(tmp_dir, "./adapter.tur.safetensors")))
self.assertLess(result["eval_loss"], result["train_loss"])
@unittest.skip("temporary to avoid failing on circleci")
def test_run_speech_recognition_seq2seq(self):
tmp_dir = self.get_auto_remove_tmp_dir()
testargs = f"""
@ -492,7 +485,6 @@ class ExamplesTests(TestCasePlus):
--dataset_config_name clean
--train_split_name validation
--eval_split_name validation
--trust_remote_code
--do_train
--do_eval
--learning_rate 1e-4
@ -520,7 +512,6 @@ class ExamplesTests(TestCasePlus):
--output_dir {tmp_dir}
--model_name_or_path hf-internal-testing/tiny-random-wav2vec2
--dataset_name anton-l/superb_demo
--trust_remote_code
--dataset_config_name ks
--train_split_name test
--eval_split_name test
@ -555,7 +546,6 @@ class ExamplesTests(TestCasePlus):
--dataset_name hf-internal-testing/librispeech_asr_dummy
--dataset_config_names clean
--dataset_split_names validation
--trust_remote_code
--learning_rate 1e-4
--per_device_train_batch_size 4
--per_device_eval_batch_size 4
@ -576,7 +566,6 @@ class ExamplesTests(TestCasePlus):
run_mae.py
--output_dir {tmp_dir}
--dataset_name hf-internal-testing/cats_vs_dogs_sample
--trust_remote_code
--do_train
--do_eval
--learning_rate 1e-4

View File

@ -315,7 +315,6 @@ class ExamplesTests(TestCasePlus):
testargs = f"""
run_image_classification.py
--dataset_name hf-internal-testing/cats_vs_dogs_sample
--trust_remote_code
--model_name_or_path microsoft/resnet-18
--do_train
--do_eval

View File

@ -103,7 +103,7 @@ _deps = [
"codecarbon>=2.8.1",
"cookiecutter==1.7.3",
"dataclasses",
"datasets!=2.5.0",
"datasets!=2.5.0", # pinned to datasets@main in create_circleci_config.py
"deepspeed>=0.9.3",
"diffusers",
"dill<0.3.5",

View File

@ -206,7 +206,7 @@ def convert_audio_spectrogram_transformer_checkpoint(model_name, pytorch_dump_fo
if "speech-commands" in model_name:
# TODO: Convert dataset to Parquet
dataset = load_dataset("google/speech_commands", "v0.02", split="validation", trust_remote_code=True)
dataset = load_dataset("google/speech_commands", "v0.02", split="validation")
waveform = dataset[0]["audio"]["array"]
else:
filepath = hf_hub_download(

View File

@ -444,9 +444,10 @@ class ASTModel(ASTPreTrainedModel):
input_values (`torch.FloatTensor` of shape `(batch_size, max_length, num_mel_bins)`):
Float values of mel features extracted from the raw audio waveform. Raw audio waveform can be obtained by
loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via
the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
[`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a
tensor of type `torch.FloatTensor`. See [`~ASTFeatureExtractor.__call__`]
the torchcodec library (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
To prepare the array into `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the
mel features, padding and conversion into a tensor of type `torch.FloatTensor`.
See [`~ASTFeatureExtractor.__call__`]
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
@ -534,9 +535,10 @@ class ASTForAudioClassification(ASTPreTrainedModel):
input_values (`torch.FloatTensor` of shape `(batch_size, max_length, num_mel_bins)`):
Float values of mel features extracted from the raw audio waveform. Raw audio waveform can be obtained by
loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via
the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
[`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a
tensor of type `torch.FloatTensor`. See [`~ASTFeatureExtractor.__call__`]
the torchcodec library (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
To prepare the array into `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the
mel features, padding and conversion into a tensor of type `torch.FloatTensor`.
See [`~ASTFeatureExtractor.__call__`]
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the audio classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
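
A short sketch of the path these updated docstrings describe, decoding with torchcodec and letting the feature extractor build the mel `input_values` (the checkpoint and file name are illustrative, and 16 kHz mono audio is assumed):

```python
from torchcodec.decoders import AudioDecoder
from transformers import AutoFeatureExtractor

extractor = AutoFeatureExtractor.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")

samples = AudioDecoder("audio.wav").get_all_samples()  # assumed local file
waveform = samples.data[0].numpy()  # first channel as a 1-D array
inputs = extractor(waveform, sampling_rate=samples.sample_rate, return_tensors="pt")
print(inputs.input_values.shape)  # (batch_size, max_length, num_mel_bins)
```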

View File

@ -266,7 +266,7 @@ def convert_beit_checkpoint(checkpoint_url, pytorch_dump_folder_path):
# Check outputs on an image
if is_semantic:
image_processor = BeitImageProcessor(size=config.image_size, do_center_crop=False)
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
image = Image.open(ds[0]["file"])
else:
image_processor = BeitImageProcessor(

View File

@ -226,7 +226,7 @@ def convert_wav2vec2_checkpoint(
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-lv60")
ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True)
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
input_audio = [x["array"] for x in ds[:4]["audio"]]
inputs = processor(input_audio, return_tensors="pt", padding=True)

View File

@ -1066,9 +1066,10 @@ class Data2VecAudioForSequenceClassification(Data2VecAudioPreTrainedModel):
r"""
input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
conversion into a tensor of type `torch.FloatTensor`. See [`Data2VecAudioProcessor.__call__`] for details.
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
See [`Data2VecAudioProcessor.__call__`] for details.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
@ -1180,9 +1181,10 @@ class Data2VecAudioForAudioFrameClassification(Data2VecAudioPreTrainedModel):
r"""
input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
conversion into a tensor of type `torch.FloatTensor`. See [`Data2VecAudioProcessor.__call__`] for details.
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
See [`Data2VecAudioProcessor.__call__`] for details.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
@ -1362,9 +1364,10 @@ class Data2VecAudioForXVector(Data2VecAudioPreTrainedModel):
r"""
input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
conversion into a tensor of type `torch.FloatTensor`. See [`Data2VecAudioProcessor.__call__`] for details.
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
See [`Data2VecAudioProcessor.__call__`] for details.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If

View File

@ -1006,15 +1006,15 @@ class HubertModel(HubertPreTrainedModel):
```python
>>> from transformers import AutoProcessor, HubertModel
>>> from datasets import load_dataset
>>> import soundfile as sf
>>> from torchcodec.decoders import AudioDecoder
>>> processor = AutoProcessor.from_pretrained("facebook/hubert-large-ls960-ft")
>>> model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft")
>>> def map_to_array(batch):
... speech, _ = sf.read(batch["file"])
... batch["speech"] = speech
... decoder = AudioDecoder(batch["file"])
... batch["speech"] = decoder.get_all_samples().data
... return batch
@ -1282,9 +1282,10 @@ class HubertForSequenceClassification(HubertPreTrainedModel):
r"""
input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
conversion into a tensor of type `torch.FloatTensor`. See [`HubertProcessor.__call__`] for details.
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
See [`HubertProcessor.__call__`] for details.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If

View File

@ -1459,15 +1459,15 @@ class TFHubertModel(TFHubertPreTrainedModel):
```python
>>> from transformers import AutoProcessor, TFHubertModel
>>> from datasets import load_dataset
>>> import soundfile as sf
>>> from torchcodec.decoders import AudioDecoder
>>> processor = AutoProcessor.from_pretrained("facebook/hubert-large-ls960-ft")
>>> model = TFHubertModel.from_pretrained("facebook/hubert-large-ls960-ft")
>>> def map_to_array(batch):
... speech, _ = sf.read(batch["file"])
... batch["speech"] = speech
... decoder = AudioDecoder(batch["file"])
... batch["speech"] = decoder.get_all_samples().data
... return batch
@ -1571,15 +1571,15 @@ class TFHubertForCTC(TFHubertPreTrainedModel):
>>> import tensorflow as tf
>>> from transformers import AutoProcessor, TFHubertForCTC
>>> from datasets import load_dataset
>>> import soundfile as sf
>>> from torchcodec.decoders import AudioDecoder
>>> processor = AutoProcessor.from_pretrained("facebook/hubert-large-ls960-ft")
>>> model = TFHubertForCTC.from_pretrained("facebook/hubert-large-ls960-ft")
>>> def map_to_array(batch):
... speech, _ = sf.read(batch["file"])
... batch["speech"] = speech
... decoder = AudioDecoder(batch["file"])
... batch["speech"] = decoder.get_all_samples().data
... return batch

View File

@ -239,15 +239,15 @@ class HubertModel(Wav2Vec2Model, HubertPreTrainedModel):
```python
>>> from transformers import AutoProcessor, HubertModel
>>> from datasets import load_dataset
>>> import soundfile as sf
>>> from torchcodec.decoders import AudioDecoder
>>> processor = AutoProcessor.from_pretrained("facebook/hubert-large-ls960-ft")
>>> model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft")
>>> def map_to_array(batch):
... speech, _ = sf.read(batch["file"])
... batch["speech"] = speech
... decoder = AudioDecoder(batch["file"])
... batch["speech"] = decoder.get_all_samples().data
... return batch

View File

@ -1223,7 +1223,7 @@ class LayoutLMForQuestionAnswering(LayoutLMPreTrainedModel):
>>> tokenizer = AutoTokenizer.from_pretrained("impira/layoutlm-document-qa", add_prefix_space=True)
>>> model = LayoutLMForQuestionAnswering.from_pretrained("impira/layoutlm-document-qa", revision="1e3ebac")
>>> dataset = load_dataset("nielsr/funsd", split="train", trust_remote_code=True)
>>> dataset = load_dataset("nielsr/funsd", split="train")
>>> example = dataset[0]
>>> question = "what's his name?"
>>> words = example["words"]

View File

@ -1601,7 +1601,7 @@ class TFLayoutLMForQuestionAnswering(TFLayoutLMPreTrainedModel, TFQuestionAnswer
>>> tokenizer = AutoTokenizer.from_pretrained("impira/layoutlm-document-qa", add_prefix_space=True)
>>> model = TFLayoutLMForQuestionAnswering.from_pretrained("impira/layoutlm-document-qa", revision="1e3ebac")
>>> dataset = load_dataset("nielsr/funsd", split="train", trust_remote_code=True)
>>> dataset = load_dataset("nielsr/funsd", split="train")
>>> example = dataset[0]
>>> question = "what's his name?"
>>> words = example["words"]

View File

@ -763,9 +763,8 @@ class LayoutLMv2Model(LayoutLMv2PreTrainedModel):
>>> model = LayoutLMv2Model.from_pretrained("microsoft/layoutlmv2-base-uncased")
>>> dataset = load_dataset("hf-internal-testing/fixtures_docvqa", trust_remote_code=True)
>>> image_path = dataset["test"][0]["file"]
>>> image = Image.open(image_path).convert("RGB")
>>> dataset = load_dataset("hf-internal-testing/fixtures_docvqa")
>>> image = dataset["test"][0]["image"]
>>> encoding = processor(image, return_tensors="pt")
@ -953,7 +952,7 @@ class LayoutLMv2ForSequenceClassification(LayoutLMv2PreTrainedModel):
>>> set_seed(0)
>>> dataset = load_dataset("aharley/rvl_cdip", split="train", streaming=True, trust_remote_code=True)
>>> dataset = load_dataset("aharley/rvl_cdip", split="train", streaming=True)
>>> data = next(iter(dataset))
>>> image = data["image"].convert("RGB")
@ -1155,7 +1154,7 @@ class LayoutLMv2ForTokenClassification(LayoutLMv2PreTrainedModel):
>>> set_seed(0)
>>> datasets = load_dataset("nielsr/funsd", split="test", trust_remote_code=True)
>>> datasets = load_dataset("nielsr/funsd", split="test")
>>> labels = datasets.features["ner_tags"].feature.names
>>> id2label = {v: k for v, k in enumerate(labels)}
@ -1312,9 +1311,8 @@ class LayoutLMv2ForQuestionAnswering(LayoutLMv2PreTrainedModel):
>>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv2-base-uncased")
>>> model = LayoutLMv2ForQuestionAnswering.from_pretrained("microsoft/layoutlmv2-base-uncased")
>>> dataset = load_dataset("hf-internal-testing/fixtures_docvqa", trust_remote_code=True)
>>> image_path = dataset["test"][0]["file"]
>>> image = Image.open(image_path).convert("RGB")
>>> dataset = load_dataset("hf-internal-testing/fixtures_docvqa")
>>> image = dataset["test"][0]["image"]
>>> question = "When is coffee break?"
>>> encoding = processor(image, question, return_tensors="pt")

View File

@ -746,7 +746,7 @@ class LayoutLMv3Model(LayoutLMv3PreTrainedModel):
>>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
>>> model = AutoModel.from_pretrained("microsoft/layoutlmv3-base")
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
>>> example = dataset[0]
>>> image = example["image"]
>>> words = example["tokens"]
@ -961,7 +961,7 @@ class LayoutLMv3ForTokenClassification(LayoutLMv3PreTrainedModel):
>>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
>>> model = AutoModelForTokenClassification.from_pretrained("microsoft/layoutlmv3-base", num_labels=7)
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
>>> example = dataset[0]
>>> image = example["image"]
>>> words = example["tokens"]
@ -1062,7 +1062,7 @@ class LayoutLMv3ForQuestionAnswering(LayoutLMv3PreTrainedModel):
>>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
>>> model = AutoModelForQuestionAnswering.from_pretrained("microsoft/layoutlmv3-base")
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
>>> example = dataset[0]
>>> image = example["image"]
>>> question = "what's his name?"
@ -1182,7 +1182,7 @@ class LayoutLMv3ForSequenceClassification(LayoutLMv3PreTrainedModel):
>>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
>>> model = AutoModelForSequenceClassification.from_pretrained("microsoft/layoutlmv3-base")
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
>>> example = dataset[0]
>>> image = example["image"]
>>> words = example["tokens"]

View File

@ -1296,7 +1296,7 @@ class TFLayoutLMv3Model(TFLayoutLMv3PreTrainedModel):
>>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
>>> model = TFAutoModel.from_pretrained("microsoft/layoutlmv3-base")
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
>>> example = dataset[0]
>>> image = example["image"]
>>> words = example["tokens"]
@ -1439,7 +1439,7 @@ class TFLayoutLMv3ForSequenceClassification(TFLayoutLMv3PreTrainedModel, TFSeque
>>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
>>> model = TFAutoModelForSequenceClassification.from_pretrained("microsoft/layoutlmv3-base")
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
>>> example = dataset[0]
>>> image = example["image"]
>>> words = example["tokens"]
@ -1566,7 +1566,7 @@ class TFLayoutLMv3ForTokenClassification(TFLayoutLMv3PreTrainedModel, TFTokenCla
>>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
>>> model = TFAutoModelForTokenClassification.from_pretrained("microsoft/layoutlmv3-base", num_labels=7)
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
>>> example = dataset[0]
>>> image = example["image"]
>>> words = example["tokens"]
@ -1703,7 +1703,7 @@ class TFLayoutLMv3ForQuestionAnswering(TFLayoutLMv3PreTrainedModel, TFQuestionAn
>>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
>>> model = TFAutoModelForQuestionAnswering.from_pretrained("microsoft/layoutlmv3-base")
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
>>> example = dataset[0]
>>> image = example["image"]
>>> question = "what's his name?"

View File

@ -653,7 +653,7 @@ class LiltModel(LiltPreTrainedModel):
>>> tokenizer = AutoTokenizer.from_pretrained("SCUT-DLVCLab/lilt-roberta-en-base")
>>> model = AutoModel.from_pretrained("SCUT-DLVCLab/lilt-roberta-en-base")
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
>>> example = dataset[0]
>>> words = example["tokens"]
>>> boxes = example["bboxes"]
@ -793,7 +793,7 @@ class LiltForSequenceClassification(LiltPreTrainedModel):
>>> tokenizer = AutoTokenizer.from_pretrained("SCUT-DLVCLab/lilt-roberta-en-base")
>>> model = AutoModelForSequenceClassification.from_pretrained("SCUT-DLVCLab/lilt-roberta-en-base")
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
>>> example = dataset[0]
>>> words = example["tokens"]
>>> boxes = example["bboxes"]
@ -908,7 +908,7 @@ class LiltForTokenClassification(LiltPreTrainedModel):
>>> tokenizer = AutoTokenizer.from_pretrained("SCUT-DLVCLab/lilt-roberta-en-base")
>>> model = AutoModelForTokenClassification.from_pretrained("SCUT-DLVCLab/lilt-roberta-en-base")
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
>>> example = dataset[0]
>>> words = example["tokens"]
>>> boxes = example["bboxes"]
@ -1025,7 +1025,7 @@ class LiltForQuestionAnswering(LiltPreTrainedModel):
>>> tokenizer = AutoTokenizer.from_pretrained("SCUT-DLVCLab/lilt-roberta-en-base")
>>> model = AutoModelForQuestionAnswering.from_pretrained("SCUT-DLVCLab/lilt-roberta-en-base")
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
>>> example = dataset[0]
>>> words = example["tokens"]
>>> boxes = example["bboxes"]

View File

@ -563,7 +563,8 @@ class MoonshineEncoder(MoonshinePreTrainedModel):
input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
Float values of the raw speech waveform. Raw speech waveform can be
obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
`numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
`numpy.ndarray`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
the soundfile library (`pip install soundfile`). To prepare the array into
`input_values`, the [`AutoFeatureExtractor`] should be used for padding
and conversion into a tensor of type `torch.FloatTensor`.
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
@ -1028,7 +1029,8 @@ class MoonshineModel(MoonshinePreTrainedModel):
input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
Float values of the raw speech waveform. Raw speech waveform can be
obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
`numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
`numpy.ndarray`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
the soundfile library (`pip install soundfile`). To prepare the array into
`input_values`, the [`AutoFeatureExtractor`] should be used for padding
and conversion into a tensor of type `torch.FloatTensor`.
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
@ -1204,7 +1206,8 @@ class MoonshineForConditionalGeneration(MoonshinePreTrainedModel, GenerationMixi
input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
Float values of the raw speech waveform. Raw speech waveform can be
obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
`numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
`numpy.ndarray`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
the soundfile library (`pip install soundfile`). To prepare the array into
`input_values`, the [`AutoFeatureExtractor`] should be used for padding
and conversion into a tensor of type `torch.FloatTensor`.
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):

View File

@ -587,7 +587,8 @@ class MoonshineEncoder(MoonshinePreTrainedModel):
input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
Float values of the raw speech waveform. Raw speech waveform can be
obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
`numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
`numpy.ndarray`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
the soundfile library (`pip install soundfile`). To prepare the array into
`input_values`, the [`AutoFeatureExtractor`] should be used for padding
and conversion into a tensor of type `torch.FloatTensor`.
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
@ -844,7 +845,8 @@ class MoonshineModel(WhisperModel):
input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
Float values of the raw speech waveform. Raw speech waveform can be
obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
`numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
`numpy.ndarray`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
the soundfile library (`pip install soundfile`). To prepare the array into
`input_values`, the [`AutoFeatureExtractor`] should be used for padding
and conversion into a tensor of type `torch.FloatTensor`.
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
@ -1004,7 +1006,8 @@ class MoonshineForConditionalGeneration(MoonshinePreTrainedModel, GenerationMixi
input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
Float values of the raw speech waveform. Raw speech waveform can be
obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
`numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
`numpy.ndarray`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
the soundfile library (`pip install soundfile`). To prepare the array into
`input_values`, the [`AutoFeatureExtractor`] should be used for padding
and conversion into a tensor of type `torch.FloatTensor`.
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):

View File

@ -852,7 +852,8 @@ class Qwen2_5OmniAudioEncoder(Qwen2_5OmniPreTrainedModel):
input_features (`torch.LongTensor` of shape `(batch_size, feature_size, sequence_length)`):
Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
`numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
`numpy.ndarray`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
the soundfile library (`pip install soundfile`). To prepare the array into
`input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
feature_lens (`torch.LongTensor` of shape `(batch_size,)`):
@ -1945,9 +1946,10 @@ class Qwen2_5OmniThinkerForConditionalGeneration(Qwen2_5OmniPreTrainedModelForCo
input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, feature_sequence_length)`):
Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be obtained by
loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via
the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
[`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a
tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
the torchcodec library (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
To prepare the array into `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the
mel features, padding and conversion into a tensor of type `torch.FloatTensor`.
See [`~WhisperFeatureExtractor.__call__`]
pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`, *optional*):
The tensors corresponding to the input videos. Pixel values can be obtained using
[`AutoImageProcessor`]. See [`SiglipImageProcessor.__call__`] for details ([`NewTaskModelProcessor`] uses

View File

@ -1863,7 +1863,8 @@ class Qwen2_5OmniAudioEncoder(Qwen2_5OmniPreTrainedModel):
input_features (`torch.LongTensor` of shape `(batch_size, feature_size, sequence_length)`):
Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
`numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
`numpy.ndarray`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
the soundfile library (`pip install soundfile`). To prepare the array into
`input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
feature_lens (`torch.LongTensor` of shape `(batch_size,)`):
@ -2367,9 +2368,10 @@ class Qwen2_5OmniThinkerForConditionalGeneration(Qwen2_5OmniPreTrainedModelForCo
input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, feature_sequence_length)`):
Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be obtained by
loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via
the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
[`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a
tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
the torchcodec library (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
To prepare the array into `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the
mel features, padding and conversion into a tensor of type `torch.FloatTensor`.
See [`~WhisperFeatureExtractor.__call__`]
pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`, *optional*):
The tensors corresponding to the input videos. Pixel values can be obtained using
[`AutoImageProcessor`]. See [`SiglipImageProcessor.__call__`] for details ([`NewTaskModelProcessor`] uses

View File

@ -368,7 +368,8 @@ class Qwen2AudioEncoder(Qwen2AudioPreTrainedModel):
input_features (`torch.LongTensor` of shape `(batch_size, feature_size, sequence_length)`):
Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
`numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
`numpy.ndarray`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
the soundfile library (`pip install soundfile`). To prepare the array into
`input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
attention_mask (`torch.Tensor`, *optional*):
@ -757,9 +758,10 @@ class Qwen2AudioForConditionalGeneration(Qwen2AudioPreTrainedModel, GenerationMi
input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, feature_sequence_length)`):
Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be obtained by
loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via
the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
[`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a
tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
the torchcodec library (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
To prepare the array into `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the
mel features, padding and conversion into a tensor of type `torch.FloatTensor`.
See [`~WhisperFeatureExtractor.__call__`]
feature_attention_mask (`torch.Tensor` of shape `(batch_size, feature_sequence_length)`):
Mask to avoid performing attention on padding feature indices. Mask values selected in `[0, 1]`:

View File

@ -1090,9 +1090,10 @@ class SEWForSequenceClassification(SEWPreTrainedModel):
r"""
input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
conversion into a tensor of type `torch.FloatTensor`. See [`SEWProcessor.__call__`] for details.
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
See [`SEWProcessor.__call__`] for details.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If

View File

@ -1613,9 +1613,10 @@ class SEWDForSequenceClassification(SEWDPreTrainedModel):
r"""
input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
conversion into a tensor of type `torch.FloatTensor`. See [`SEWDProcessor.__call__`] for details.
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
See [`SEWDProcessor.__call__`] for details.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If

View File

@ -86,8 +86,9 @@ SPEECH_ENCODER_DECODER_INPUTS_DOCSTRING = r"""
Args:
inputs (`jnp.ndarray` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, feature_dim)`, *optional*):
Float values of input raw speech waveform or speech features. Values can be obtained by loading a `.flac`
or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile
library (`pip install soundfile`). To prepare the array into `inputs`, either the [`Wav2Vec2Processor`] or
or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip
install torchcodec`) or the soundfile library (`pip install soundfile`).
To prepare the array into `inputs`, either the [`Wav2Vec2Processor`] or
[`Speech2TextProcessor`] should be used for padding and conversion into a tensor of type
`torch.FloatTensor`.
attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*):

View File

@ -339,8 +339,9 @@ class SpeechEncoderDecoderModel(PreTrainedModel, GenerationMixin):
r"""
inputs (`torch.FloatTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, feature_dim)`, *optional*):
Float values of input raw speech waveform or speech features. Values can be obtained by loading a `.flac`
or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile
library (`pip install soundfile`). To prepare the array into `inputs`, either the [`Wav2Vec2Processor`] or
or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip
install torchcodec`) or the soundfile library (`pip install soundfile`).
To prepare the array into `inputs`, either the [`Wav2Vec2Processor`] or
[`Speech2TextProcessor`] should be used for padding and conversion into a tensor of type
`torch.FloatTensor`.
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):

View File

@ -619,7 +619,8 @@ class Speech2TextEncoder(Speech2TextPreTrainedModel):
input_features (`torch.LongTensor` of shape `(batch_size, sequence_length, feature_size)`):
Float values of fbank features extracted from the raw speech waveform. Raw speech waveform can be
obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
`numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
`numpy.ndarray`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
the soundfile library (`pip install soundfile`). To prepare the array into
`input_features`, the [`AutoFeatureExtractor`] should be used for extracting the fbank features,
padding and conversion into a tensor of type `torch.FloatTensor`. See
[`~Speech2TextFeatureExtractor.__call__`]

View File

@ -848,7 +848,8 @@ class TFSpeech2TextEncoder(keras.layers.Layer):
input_features (`tf.Tensor` of shape `(batch_size, sequence_length, feature_size)`):
Float values of fbank features extracted from the raw speech waveform. Raw speech waveform can be
obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
`numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
`numpy.ndarray`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
the soundfile library (`pip install soundfile`). To prepare the array into
`input_features`, the [`AutoFeatureExtractor`] should be used for extracting the fbank features,
padding and conversion into a tensor of floats. See [`~Speech2TextFeatureExtractor.__call__`]
attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
@ -1469,7 +1470,7 @@ class TFSpeech2TextForConditionalGeneration(TFSpeech2TextPreTrainedModel, TFCaus
>>> import tensorflow as tf
>>> from transformers import Speech2TextProcessor, TFSpeech2TextForConditionalGeneration
>>> from datasets import load_dataset
>>> import soundfile as sf
>>> from torchcodec.decoders import AudioDecoder
>>> model = TFSpeech2TextForConditionalGeneration.from_pretrained(
... "facebook/s2t-small-librispeech-asr", from_pt=True
@ -1478,8 +1479,8 @@ class TFSpeech2TextForConditionalGeneration(TFSpeech2TextPreTrainedModel, TFCaus
>>> def map_to_array(batch):
... speech, _ = sf.read(batch["file"])
... batch["speech"] = speech
... decoder = AudioDecoder(batch["file"])
... batch["speech"] = decoder.get_all_samples().data
... return batch

View File

@ -2228,7 +2228,7 @@ class SpeechT5ForSpeechToText(SpeechT5PreTrainedModel, GenerationMixin):
>>> from datasets import load_dataset
>>> dataset = load_dataset(
... "hf-internal-testing/librispeech_asr_demo", "clean", split="validation", trust_remote_code=True
... "hf-internal-testing/librispeech_asr_demo", "clean", split="validation"
... ) # doctest: +IGNORE_RESULT
>>> dataset = dataset.sort("id")
>>> sampling_rate = dataset.features["audio"].sampling_rate
@ -2909,7 +2909,7 @@ class SpeechT5ForSpeechToSpeech(SpeechT5PreTrainedModel):
>>> import torch
>>> dataset = load_dataset(
... "hf-internal-testing/librispeech_asr_demo", "clean", split="validation", trust_remote_code=True
... "hf-internal-testing/librispeech_asr_demo", "clean", split="validation"
... ) # doctest: +IGNORE_RESULT
>>> dataset = dataset.sort("id")
>>> sampling_rate = dataset.features["audio"].sampling_rate

View File

@ -1604,7 +1604,7 @@ class UdopModel(UdopPreTrainedModel):
>>> # load an example image, along with the words and coordinates
>>> # which were extracted using an OCR engine
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
>>> example = dataset[0]
>>> image = example["image"]
>>> words = example["tokens"]
@ -1813,7 +1813,7 @@ class UdopForConditionalGeneration(UdopPreTrainedModel, GenerationMixin):
>>> # load an example image, along with the words and coordinates
>>> # which were extracted using an OCR engine
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
>>> example = dataset[0]
>>> image = example["image"]
>>> words = example["tokens"]
@ -2025,7 +2025,7 @@ class UdopEncoderModel(UdopPreTrainedModel):
>>> # load an example image, along with the words and coordinates
>>> # which were extracted using an OCR engine
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
>>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
>>> example = dataset[0]
>>> image = example["image"]
>>> words = example["tokens"]

View File

@ -1514,9 +1514,10 @@ class UniSpeechForSequenceClassification(UniSpeechPreTrainedModel):
r"""
input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
conversion into a tensor of type `torch.FloatTensor`. See [`UniSpeechProcessor.__call__`] for details.
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
See [`UniSpeechProcessor.__call__`] for details.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If

View File

@ -1507,9 +1507,10 @@ class UniSpeechSatForSequenceClassification(UniSpeechSatPreTrainedModel):
r"""
input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
conversion into a tensor of type `torch.FloatTensor`. See [`UniSpeechSatProcessor.__call__`] for details.
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
See [`UniSpeechSatProcessor.__call__`] for details.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
@ -1621,9 +1622,10 @@ class UniSpeechSatForAudioFrameClassification(UniSpeechSatPreTrainedModel):
r"""
input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
conversion into a tensor of type `torch.FloatTensor`. See [`UniSpeechSatProcessor.__call__`] for details.
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
See [`UniSpeechSatProcessor.__call__`] for details.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
@ -1803,9 +1805,10 @@ class UniSpeechSatForXVector(UniSpeechSatPreTrainedModel):
r"""
input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
conversion into a tensor of type `torch.FloatTensor`. See [`UniSpeechSatProcessor.__call__`] for details.
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
See [`UniSpeechSatProcessor.__call__`] for details.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If

View File

@ -1064,15 +1064,15 @@ FLAX_WAV2VEC2_MODEL_DOCSTRING = """
```python
>>> from transformers import AutoProcessor, FlaxWav2Vec2Model
>>> from datasets import load_dataset
>>> import soundfile as sf
>>> from torchcodec.decoders import AudioDecoder
>>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-large-lv60")
>>> model = FlaxWav2Vec2Model.from_pretrained("facebook/wav2vec2-large-lv60")
>>> def map_to_array(batch):
... speech, _ = sf.read(batch["file"])
... batch["speech"] = speech
... decoder = AudioDecoder(batch["file"])
... batch["speech"] = decoder.get_all_samples().data
... return batch
@ -1183,15 +1183,15 @@ FLAX_WAV2VEC2_FOR_CTC_DOCSTRING = """
>>> import jax.numpy as jnp
>>> from transformers import AutoProcessor, FlaxWav2Vec2ForCTC
>>> from datasets import load_dataset
>>> import soundfile as sf
>>> from torchcodec.decoders import AudioDecoder
>>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-large-960h-lv60")
>>> model = FlaxWav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h-lv60")
>>> def map_to_array(batch):
... speech, _ = sf.read(batch["file"])
... batch["speech"] = speech
... decoder = AudioDecoder(batch["file"])
... batch["speech"] = decoder.get_all_samples().data
... return batch
@ -1384,15 +1384,15 @@ FLAX_WAV2VEC2_FOR_PRETRAINING_DOCSTRING = """
>>> from transformers import AutoFeatureExtractor, FlaxWav2Vec2ForPreTraining
>>> from transformers.models.wav2vec2.modeling_flax_wav2vec2 import _compute_mask_indices
>>> from datasets import load_dataset
>>> import soundfile as sf
>>> from torchcodec.decoders import AudioDecoder
>>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-large-lv60")
>>> model = FlaxWav2Vec2ForPreTraining.from_pretrained("facebook/wav2vec2-large-lv60")
>>> def map_to_array(batch):
... speech, _ = sf.read(batch["file"])
... batch["speech"] = speech
... decoder = AudioDecoder(batch["file"])
... batch["speech"] = decoder.get_all_samples().data
... return batch
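The converted `map_to_array` helpers are then wired into a `datasets` pipeline as before. A minimal sketch of that step (the dataset name and `"file"` column are assumptions based on the surrounding docstrings; note that torchcodec returns `(num_channels, num_samples)` where soundfile returned a 1-D array for mono audio):

```python
# Sketch: apply the torchcodec-based map function over a dataset,
# mirroring the `map_to_array` helpers rewritten above.
from datasets import load_dataset
from torchcodec.decoders import AudioDecoder

def map_to_array(batch):
    samples = AudioDecoder(batch["file"]).get_all_samples()
    batch["speech"] = samples.data.squeeze(0).numpy()  # mono: drop the channel dim
    return batch

ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
ds = ds.map(map_to_array)
```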

View File

@ -1530,15 +1530,15 @@ class TFWav2Vec2Model(TFWav2Vec2PreTrainedModel):
```python
>>> from transformers import AutoProcessor, TFWav2Vec2Model
>>> from datasets import load_dataset
>>> import soundfile as sf
>>> from torchcodec.decoders import AudioDecoder
>>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h")
>>> model = TFWav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")
>>> def map_to_array(batch):
... speech, _ = sf.read(batch["file"])
... batch["speech"] = speech
... decoder = AudioDecoder(batch["file"])
... batch["speech"] = decoder.get_all_samples().data
... return batch
@ -1642,15 +1642,15 @@ class TFWav2Vec2ForCTC(TFWav2Vec2PreTrainedModel):
>>> import tensorflow as tf
>>> from transformers import AutoProcessor, TFWav2Vec2ForCTC
>>> from datasets import load_dataset
>>> import soundfile as sf
>>> from torchcodec.decoders import AudioDecoder
>>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h")
>>> model = TFWav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
>>> def map_to_array(batch):
... speech, _ = sf.read(batch["file"])
... batch["speech"] = speech
... decoder = AudioDecoder(batch["file"])
... batch["speech"] = decoder.get_all_samples().data
... return batch

View File

@ -2040,9 +2040,10 @@ class Wav2Vec2ForSequenceClassification(Wav2Vec2PreTrainedModel):
r"""
input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
conversion into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2Processor.__call__`] for details.
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
See [`Wav2Vec2Processor.__call__`] for details.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
@ -2154,9 +2155,10 @@ class Wav2Vec2ForAudioFrameClassification(Wav2Vec2PreTrainedModel):
r"""
input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
conversion into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2Processor.__call__`] for details.
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
See [`Wav2Vec2Processor.__call__`] for details.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
@ -2336,9 +2338,10 @@ class Wav2Vec2ForXVector(Wav2Vec2PreTrainedModel):
r"""
input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
conversion into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2Processor.__call__`] for details.
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
See [`Wav2Vec2Processor.__call__`] for details.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If

View File

@ -590,7 +590,7 @@ class Wav2Vec2CTCTokenizer(PreTrainedTokenizer):
>>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")
>>> # load first sample of English common_voice
>>> dataset = load_dataset("mozilla-foundation/common_voice_11_0", "en", split="train", streaming=True, trust_remote_code=True)
>>> dataset = load_dataset("mozilla-foundation/common_voice_11_0", "en", split="train", streaming=True)
>>> dataset = dataset.cast_column("audio", datasets.Audio(sampling_rate=16_000))
>>> dataset_iter = iter(dataset)
>>> sample = next(dataset_iter)

View File

@ -1600,9 +1600,10 @@ class Wav2Vec2ConformerForSequenceClassification(Wav2Vec2ConformerPreTrainedMode
r"""
input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
conversion into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2ConformerProcessor.__call__`] for details.
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
See [`Wav2Vec2ConformerProcessor.__call__`] for details.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
@ -1702,9 +1703,10 @@ class Wav2Vec2ConformerForAudioFrameClassification(Wav2Vec2ConformerPreTrainedMo
r"""
input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
conversion into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2ConformerProcessor.__call__`] for details.
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
See [`Wav2Vec2ConformerProcessor.__call__`] for details.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
@ -1872,9 +1874,10 @@ class Wav2Vec2ConformerForXVector(Wav2Vec2ConformerPreTrainedModel):
r"""
input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
conversion into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2ConformerProcessor.__call__`] for details.
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
See [`Wav2Vec2ConformerProcessor.__call__`] for details.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If

View File

@ -546,7 +546,7 @@ class Wav2Vec2ProcessorWithLM(ProcessorMixin):
>>> processor = AutoProcessor.from_pretrained("patrickvonplaten/wav2vec2-base-100h-with-lm")
>>> # load first sample of English common_voice
>>> dataset = load_dataset("mozilla-foundation/common_voice_11_0", "en", split="train", streaming=True, trust_remote_code=True)
>>> dataset = load_dataset("mozilla-foundation/common_voice_11_0", "en", split="train", streaming=True)
>>> dataset = dataset.cast_column("audio", datasets.Audio(sampling_rate=16_000))
>>> dataset_iter = iter(dataset)
>>> sample = next(dataset_iter)

View File

@ -1351,9 +1351,10 @@ class WavLMForSequenceClassification(WavLMPreTrainedModel):
r"""
input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
conversion into a tensor of type `torch.FloatTensor`. See [`WavLMProcessor.__call__`] for details.
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
See [`WavLMProcessor.__call__`] for details.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
@ -1465,9 +1466,10 @@ class WavLMForAudioFrameClassification(WavLMPreTrainedModel):
r"""
input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
conversion into a tensor of type `torch.FloatTensor`. See [`WavLMProcessor.__call__`] for details.
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
See [`WavLMProcessor.__call__`] for details.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
@ -1647,9 +1649,10 @@ class WavLMForXVector(WavLMPreTrainedModel):
r"""
input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
conversion into a tensor of type `torch.FloatTensor`. See [`WavLMProcessor.__call__`] for details.
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
See [`WavLMProcessor.__call__`] for details.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If

View File

@ -102,9 +102,10 @@ WHISPER_INPUTS_DOCSTRING = r"""
input_features (`numpy.ndarray` of shape `(batch_size, feature_size, sequence_length)`):
Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be obtained by
loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via
the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
[`WhisperFeatureExtractor`] should be used for extracting the features, padding and conversion into a
tensor of type `numpy.ndarray`. See [`~WhisperFeatureExtractor.__call__`]
the torchcodec library (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
To prepare the array into `input_features`, the [`WhisperFeatureExtractor`] should be used for extracting
the features, padding and conversion into a tensor of type `numpy.ndarray`.
See [`~WhisperFeatureExtractor.__call__`]
attention_mask (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
Whisper does not support masking of the `input_features`; this argument is preserved for compatibility but
is not used. By default, silence in the input log mel spectrogram is ignored.
@ -139,9 +140,10 @@ WHISPER_ENCODE_INPUTS_DOCSTRING = r"""
input_features (`numpy.ndarray` of shape `(batch_size, feature_size, sequence_length)`):
Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be obtained by
loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via
the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
[`WhisperFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a
tensor of type `numpy.ndarray`. See [`~WhisperFeatureExtractor.__call__`].
the torchcodec library (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
To prepare the array into `input_features`, the [`WhisperFeatureExtractor`] should be used for extracting
the mel features, padding and conversion into a tensor of type `numpy.ndarray`.
See [`~WhisperFeatureExtractor.__call__`].
attention_mask (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
Whisper does not support masking of the `input_features`; this argument is preserved for compatibility but
is not used. By default, silence in the input log mel spectrogram is ignored.
@ -1670,7 +1672,7 @@ FLAX_WHISPER_AUDIO_CLASSIFICATION_DOCSTRING = r"""
>>> model = FlaxWhisperForAudioClassification.from_pretrained(
... "sanchit-gandhi/whisper-medium-fleurs-lang-id", from_pt=True
... )
>>> ds = load_dataset("google/fleurs", "all", split="validation", streaming=True, trust_remote_code=True)
>>> ds = load_dataset("google/fleurs", "all", split="validation", streaming=True)
>>> sample = next(iter(ds))

View File

@ -601,9 +601,10 @@ WHISPER_INPUTS_DOCSTRING = r"""
input_features (`tf.Tensor` of shape `(batch_size, feature_size, sequence_length)`):
Float values of fbank features extracted from the raw speech waveform. Raw speech waveform can be obtained
by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.*
via the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
[`AutoFeatureExtractor`] should be used for extracting the fbank features, padding and conversion into a
tensor of type `tf.Tensor`. See [`~WhisperFeatureExtractor.__call__`]
via the torchcodec library (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
To prepare the array into `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the
fbank features, padding and conversion into a tensor of type `tf.Tensor`.
See [`~WhisperFeatureExtractor.__call__`]
decoder_input_ids (`tf.Tensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Indices of decoder input sequence tokens in the vocabulary.
@ -729,7 +730,8 @@ class TFWhisperEncoder(keras.layers.Layer):
input_features (`tf.Tensor` of shape `(batch_size, feature_size, sequence_length)`):
Float values of fbank features extracted from the raw speech waveform. Raw speech waveform can be
obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
`numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
`numpy.ndarray`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
the soundfile library (`pip install soundfile`). To prepare the array into
`input_features`, the [`AutoFeatureExtractor`] should be used for extracting the fbank features,
padding and conversion into a tensor of type `tf.Tensor`. See [`~WhisperFeatureExtractor.__call__`]
head_mask (`tf.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):

View File

@ -650,7 +650,8 @@ class WhisperEncoder(WhisperPreTrainedModel):
input_features (`torch.LongTensor` of shape `(batch_size, feature_size, sequence_length)`):
Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
`numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
`numpy.ndarray`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
the soundfile library (`pip install soundfile`). To prepare the array into
`input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
attention_mask (`torch.Tensor`, *optional*):
@ -1121,9 +1122,10 @@ class WhisperModel(WhisperPreTrainedModel):
input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, sequence_length)`):
Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be obtained by
loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via
the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
[`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a
tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
the torchcodec library (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
To prepare the array into `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the
mel features, padding and conversion into a tensor of type `torch.FloatTensor`.
See [`~WhisperFeatureExtractor.__call__`]
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Indices of decoder input sequence tokens in the vocabulary.
@ -1291,9 +1293,10 @@ class WhisperForConditionalGeneration(WhisperGenerationMixin, WhisperPreTrainedM
input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, sequence_length)`):
Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be obtained by
loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via
the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
[`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a
tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
the torchcodec library (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
To prepare the array into `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the
mel features, padding and conversion into a tensor of type `torch.FloatTensor`.
See [`~WhisperFeatureExtractor.__call__`]
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Indices of decoder input sequence tokens in the vocabulary.
@ -1625,9 +1628,10 @@ class WhisperForAudioClassification(WhisperPreTrainedModel):
input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, sequence_length)`):
Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be obtained by
loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via
the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
[`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a
tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
the torchcodec library (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
To prepare the array into `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the
mel features, padding and conversion into a tensor of type `torch.FloatTensor`.
See [`~WhisperFeatureExtractor.__call__`]
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
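The `input_features` docstrings imply the same torchcodec-first loading path, followed by the feature extractor. A minimal sketch, assuming a 16 kHz mono file and a placeholder checkpoint:

```python
# Sketch: decode audio with torchcodec, then extract Whisper mel features.
from torchcodec.decoders import AudioDecoder
from transformers import WhisperFeatureExtractor

samples = AudioDecoder("sample.flac").get_all_samples()
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-tiny")  # placeholder
features = feature_extractor(
    samples.data.squeeze(0).numpy(),
    sampling_rate=samples.sample_rate,  # Whisper expects 16 kHz; resample first if needed
    return_tensors="pt",
)
input_features = features.input_features  # (batch_size, feature_size, sequence_length)
```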

View File

@ -129,7 +129,6 @@ from .utils import (
is_scipy_available,
is_sentencepiece_available,
is_seqio_available,
is_soundfile_available,
is_spacy_available,
is_speech_available,
is_spqr_available,
@ -158,6 +157,7 @@ from .utils import (
is_torch_xpu_available,
is_torchao_available,
is_torchaudio_available,
is_torchcodec_available,
is_torchdynamo_available,
is_torchvision_available,
is_vision_available,
@ -634,6 +634,16 @@ def require_torchvision(test_case):
return unittest.skipUnless(is_torchvision_available(), "test requires Torchvision")(test_case)
def require_torchcodec(test_case):
"""
Decorator marking a test that requires Torchcodec.
These tests are skipped when Torchcodec isn't installed.
"""
return unittest.skipUnless(is_torchcodec_available(), "test requires Torchcodec")(test_case)
def require_torch_or_tf(test_case):
"""
Decorator marking a test that requires PyTorch or TensorFlow.
@ -1229,16 +1239,6 @@ def require_clearml(test_case):
return unittest.skipUnless(is_clearml_available(), "test requires clearml")(test_case)
def require_soundfile(test_case):
"""
Decorator marking a test that requires soundfile
These tests are skipped when soundfile isn't installed.
"""
return unittest.skipUnless(is_soundfile_available(), "test requires soundfile")(test_case)
def require_deepspeed(test_case):
"""
Decorator marking a test that requires deepspeed

View File

@ -253,6 +253,7 @@ from .import_utils import (
is_torch_xpu_available,
is_torchao_available,
is_torchaudio_available,
is_torchcodec_available,
is_torchdistx_available,
is_torchdynamo_available,
is_torchdynamo_compiling,

View File

@ -239,9 +239,10 @@ class ModelArgs:
input_values = {
"description": """
Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
conversion into a tensor of type `torch.FloatTensor`. See [`{processor_class}.__call__`] for details.
into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
See [`{processor_class}.__call__`] for details.
""",
"shape": "of shape `(batch_size, sequence_length)`",
}

View File

@ -423,7 +423,7 @@ PT_SPEECH_BASE_MODEL_SAMPLE = r"""
>>> import torch
>>> from datasets import load_dataset
>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation", trust_remote_code=True)
>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
>>> dataset = dataset.sort("id")
>>> sampling_rate = dataset.features["audio"].sampling_rate
@ -449,7 +449,7 @@ PT_SPEECH_CTC_SAMPLE = r"""
>>> from datasets import load_dataset
>>> import torch
>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation", trust_remote_code=True)
>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
>>> dataset = dataset.sort("id")
>>> sampling_rate = dataset.features["audio"].sampling_rate
@ -484,7 +484,7 @@ PT_SPEECH_SEQ_CLASS_SAMPLE = r"""
>>> from datasets import load_dataset
>>> import torch
>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation", trust_remote_code=True)
>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
>>> dataset = dataset.sort("id")
>>> sampling_rate = dataset.features["audio"].sampling_rate
@ -520,7 +520,7 @@ PT_SPEECH_FRAME_CLASS_SAMPLE = r"""
>>> from datasets import load_dataset
>>> import torch
>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation", trust_remote_code=True)
>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
>>> dataset = dataset.sort("id")
>>> sampling_rate = dataset.features["audio"].sampling_rate
@ -549,7 +549,7 @@ PT_SPEECH_XVECTOR_SAMPLE = r"""
>>> from datasets import load_dataset
>>> import torch
>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation", trust_remote_code=True)
>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
>>> dataset = dataset.sort("id")
>>> sampling_rate = dataset.features["audio"].sampling_rate
@ -584,7 +584,7 @@ PT_VISION_BASE_MODEL_SAMPLE = r"""
>>> import torch
>>> from datasets import load_dataset
>>> dataset = load_dataset("huggingface/cats-image", trust_remote_code=True)
>>> dataset = load_dataset("huggingface/cats-image")
>>> image = dataset["test"]["image"][0]
>>> image_processor = AutoImageProcessor.from_pretrained("{checkpoint}")
@ -609,7 +609,7 @@ PT_VISION_SEQ_CLASS_SAMPLE = r"""
>>> import torch
>>> from datasets import load_dataset
>>> dataset = load_dataset("huggingface/cats-image", trust_remote_code=True)
>>> dataset = load_dataset("huggingface/cats-image")
>>> image = dataset["test"]["image"][0]
>>> image_processor = AutoImageProcessor.from_pretrained("{checkpoint}")
@ -1194,7 +1194,7 @@ TF_SPEECH_BASE_MODEL_SAMPLE = r"""
>>> from transformers import AutoProcessor, {model_class}
>>> from datasets import load_dataset
>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation", trust_remote_code=True)
>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
>>> dataset = dataset.sort("id")
>>> sampling_rate = dataset.features["audio"].sampling_rate
@ -1219,7 +1219,7 @@ TF_SPEECH_CTC_SAMPLE = r"""
>>> from datasets import load_dataset
>>> import tensorflow as tf
>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation", trust_remote_code=True)
>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
>>> dataset = dataset.sort("id")
>>> sampling_rate = dataset.features["audio"].sampling_rate
@ -1254,7 +1254,7 @@ TF_VISION_BASE_MODEL_SAMPLE = r"""
>>> from transformers import AutoImageProcessor, {model_class}
>>> from datasets import load_dataset
>>> dataset = load_dataset("huggingface/cats-image", trust_remote_code=True)
>>> dataset = load_dataset("huggingface/cats-image")
>>> image = dataset["test"]["image"][0]
>>> image_processor = AutoImageProcessor.from_pretrained("{checkpoint}")
@ -1277,7 +1277,7 @@ TF_VISION_SEQ_CLASS_SAMPLE = r"""
>>> import tensorflow as tf
>>> from datasets import load_dataset
>>> dataset = load_dataset("huggingface/cats-image", trust_remote_code=True)
>>> dataset = load_dataset("huggingface/cats-image"))
>>> image = dataset["test"]["image"][0]
>>> image_processor = AutoImageProcessor.from_pretrained("{checkpoint}")

View File

@ -215,6 +215,7 @@ _torchaudio_available = _is_package_available("torchaudio")
_torchao_available, _torchao_version = _is_package_available("torchao", return_version=True)
_torchdistx_available = _is_package_available("torchdistx")
_torchvision_available, _torchvision_version = _is_package_available("torchvision", return_version=True)
_torchcodec_available, _torchcodec_version = _is_package_available("torchcodec", return_version=True)
_mlx_available = _is_package_available("mlx")
_num2words_available = _is_package_available("num2words")
_hqq_available, _hqq_version = _is_package_available("hqq", return_version=True)
@ -432,6 +433,10 @@ def is_torchvision_available():
return _torchvision_available
def is_torchcodec_available():
return _torchcodec_available
def is_torchvision_v2_available():
if not is_torchvision_available():
return False
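Call sites can then branch on the new helper in the usual `is_*_available` pattern; a minimal sketch:

```python
# Sketch: guard torchcodec imports behind the new availability check,
# following the pattern of the other `is_*_available` helpers.
from transformers.utils import is_torchcodec_available

if is_torchcodec_available():
    from torchcodec.decoders import AudioDecoder

def load_waveform(path):
    if not is_torchcodec_available():
        raise ImportError("torchcodec is required for audio decoding: pip install torchcodec")
    samples = AudioDecoder(path).get_all_samples()
    return samples.data, samples.sample_rate
```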

View File

@ -269,7 +269,6 @@ def make_task_cmds():
"img_clas": f"""
{scripts_dir}/image-classification/run_image_classification.py
--dataset_name hf-internal-testing/cats_vs_dogs_sample
--trust_remote_code
--remove_unused_columns False
--max_steps 10
--image_processor_name {DS_TESTS_DIRECTORY}/vit_feature_extractor.json

View File

@ -27,8 +27,6 @@ if is_torch_available():
import torch
if is_vision_available():
from PIL import Image
from transformers import BeitImageProcessor
if is_torchvision_available():
@ -98,23 +96,14 @@ class BeitImageProcessingTester:
def prepare_semantic_single_inputs():
dataset = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
image = Image.open(dataset[0]["file"])
map = Image.open(dataset[1]["file"])
return image, map
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
example = ds[0]
return example["image"], example["map"]
def prepare_semantic_batch_inputs():
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
image1 = Image.open(ds[0]["file"])
map1 = Image.open(ds[1]["file"])
image2 = Image.open(ds[2]["file"])
map2 = Image.open(ds[3]["file"])
return [image1, image2], [map1, map2]
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
return list(ds["image"][:2]), list(ds["map"][:2])
@require_torch
@ -157,7 +146,6 @@ class BeitImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
self.assertEqual(image_processor.crop_size, {"height": 84, "width": 84})
self.assertEqual(image_processor.do_reduce_labels, True)
@unittest.skip("temporary to avoid failing on circleci")
def test_call_segmentation_maps(self):
for image_processing_class in self.image_processor_list:
# Initialize image_processing
@ -265,7 +253,6 @@ class BeitImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
self.assertTrue(encoding["labels"].min().item() >= 0)
self.assertTrue(encoding["labels"].max().item() <= 255)
@unittest.skip("temporary to avoid failing on circleci")
def test_reduce_labels(self):
for image_processing_class in self.image_processor_list:
# Initialize image_processing
@ -282,7 +269,6 @@ class BeitImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
self.assertTrue(encoding["labels"].min().item() >= 0)
self.assertTrue(encoding["labels"].max().item() <= 255)
@unittest.skip("temporary to avoid failing on circleci")
def test_slow_fast_equivalence(self):
if not self.test_slow_image_processor or not self.test_fast_image_processor:
self.skipTest(reason="Skipping slow/fast equivalence test")
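These fixture refactors all follow the same shape: the updated datasets expose decoded `Image` features directly, so the manual `Image.open` on a `"file"` column is replaced by indexing the `"image"`/`"map"` columns. A minimal sketch of the new access pattern, taken from the rewritten helpers above:

```python
# Sketch: the fixtures dataset now yields decoded PIL images directly.
from datasets import load_dataset

ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
image, seg_map = ds[0]["image"], ds[0]["map"]  # single example
images, maps = list(ds["image"][:2]), list(ds["map"][:2])  # small batch
```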

View File

@ -504,8 +504,8 @@ class BeitModelIntegrationTest(unittest.TestCase):
image_processor = BeitImageProcessor(do_resize=True, size=640, do_center_crop=False)
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
image = Image.open(ds[0]["file"])
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
image = ds[0]["image"].convert("RGB")
inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
# forward pass
@ -547,8 +547,8 @@ class BeitModelIntegrationTest(unittest.TestCase):
image_processor = BeitImageProcessor(do_resize=True, size=640, do_center_crop=False)
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
image = Image.open(ds[0]["file"])
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
image = ds[0]["image"].convert("RGB")
inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
# forward pass

View File

@ -21,7 +21,7 @@ from datasets import load_dataset
from tests.test_modeling_common import floats_tensor, ids_tensor, random_attention_mask
from transformers import Data2VecAudioConfig, is_torch_available
from transformers.testing_utils import require_soundfile, require_torch, slow, torch_device
from transformers.testing_utils import require_torch, require_torchcodec, slow, torch_device
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, _config_zero_init
@ -656,7 +656,7 @@ class Data2VecAudioUtilsTest(unittest.TestCase):
@require_torch
@require_soundfile
@require_torchcodec
@slow
class Data2VecAudioModelIntegrationTest(unittest.TestCase):
def _load_datasamples(self, num_samples):
@ -669,7 +669,7 @@ class Data2VecAudioModelIntegrationTest(unittest.TestCase):
return [x["array"] for x in speech_samples]
def _load_superb(self, task, num_samples):
ds = load_dataset("anton-l/superb_dummy", task, split="test", trust_remote_code=True)
ds = load_dataset("anton-l/superb_dummy", task, split="test")
return ds[:num_samples]

View File

@ -29,8 +29,6 @@ if is_torch_available():
import torch
if is_vision_available():
from PIL import Image
from transformers import DPTImageProcessor
if is_torchvision_available():
@ -94,24 +92,15 @@ class DPTImageProcessingTester:
# Copied from transformers.tests.models.beit.test_image_processing_beit.prepare_semantic_single_inputs
def prepare_semantic_single_inputs():
dataset = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
image = Image.open(dataset[0]["file"])
map = Image.open(dataset[1]["file"])
return image, map
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
example = ds[0]
return example["image"], example["map"]
# Copied from transformers.tests.models.beit.test_image_processing_beit.prepare_semantic_batch_inputs
def prepare_semantic_batch_inputs():
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
image1 = Image.open(ds[0]["file"])
map1 = Image.open(ds[1]["file"])
image2 = Image.open(ds[2]["file"])
map2 = Image.open(ds[3]["file"])
return [image1, image2], [map1, map2]
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
return list(ds["image"][:2]), list(ds["map"][:2])
@require_torch
@ -187,7 +176,6 @@ class DPTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
self.assertEqual(list(pixel_values.shape), [1, 3, 512, 672])
@unittest.skip("temporary to avoid failing on circleci")
# Copied from transformers.tests.models.beit.test_image_processing_beit.BeitImageProcessingTest.test_call_segmentation_maps
def test_call_segmentation_maps(self):
for image_processing_class in self.image_processor_list:
@ -296,7 +284,6 @@ class DPTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
self.assertTrue(encoding["labels"].min().item() >= 0)
self.assertTrue(encoding["labels"].max().item() <= 255)
@unittest.skip("temporary to avoid failing on circleci")
def test_reduce_labels(self):
for image_processing_class in self.image_processor_list:
image_processor = image_processing_class(**self.image_processor_dict)
@ -319,7 +306,6 @@ class DPTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
# Compare with non-reduced label to see if it's reduced by 1
self.assertEqual(encoding["labels"][first_non_zero_coords].item(), first_non_zero_value - 1)
@unittest.skip("temporary to avoid failing on circleci")
def test_slow_fast_equivalence(self):
if not self.test_slow_image_processor or not self.test_fast_image_processor:
self.skipTest(reason="Skipping slow/fast equivalence test")
@ -341,7 +327,6 @@ class DPTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
)
self.assertTrue(torch.allclose(image_encoding_slow.labels, image_encoding_fast.labels, atol=1e-1))
@unittest.skip("temporary to avoid failing on circleci")
def test_slow_fast_equivalence_batched(self):
if not self.test_slow_image_processor or not self.test_fast_image_processor:
self.skipTest(reason="Skipping slow/fast equivalence test")

View File

@ -391,7 +391,7 @@ class GraniteSpeechForConditionalGenerationIntegrationTest(unittest.TestCase):
EXPECTED_DECODED_TEXT = [
"systemKnowledge Cutoff Date: April 2024.\nToday's Date: December 19, 2024.\nYou are Granite, developed by IBM. You are a helpful AI assistant\nusercan you transcribe the speech into a written format?\nassistantmister quilter is the apostle of the middle classes and we are glad to welcome his gospel",
"systemKnowledge Cutoff Date: April 2024.\nToday's Date: December 19, 2024.\nYou are Granite, developed by IBM. You are a helpful AI assistant\nusercan you transcribe the speech into a written format?\nassistantnor is mister quilp's manner less interesting than his matter"
"systemKnowledge Cutoff Date: April 2024.\nToday's Date: December 19, 2024.\nYou are Granite, developed by IBM. You are a helpful AI assistant\nusercan you transcribe the speech into a written format?\nassistantnor is mister quilter's manner less interesting than his matter"
] # fmt: skip
self.assertEqual(

View File

@ -22,7 +22,7 @@ import unittest
import pytest
from transformers import HubertConfig, is_torch_available
from transformers.testing_utils import require_soundfile, require_torch, slow, torch_device
from transformers.testing_utils import require_torch, require_torchcodec, slow, torch_device
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import (
@ -750,7 +750,7 @@ class HubertUtilsTest(unittest.TestCase):
@require_torch
@require_soundfile
@require_torchcodec
@slow
class HubertModelIntegrationTest(unittest.TestCase):
def _load_datasamples(self, num_samples):
@ -767,7 +767,7 @@ class HubertModelIntegrationTest(unittest.TestCase):
def _load_superb(self, task, num_samples):
from datasets import load_dataset
ds = load_dataset("anton-l/superb_dummy", task, split="test", trust_remote_code=True)
ds = load_dataset("anton-l/superb_dummy", task, split="test")
return ds[:num_samples]

View File

@ -111,13 +111,13 @@ class LayoutLMv2ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase)
def test_layoutlmv2_integration_test(self):
from datasets import load_dataset
ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test", trust_remote_code=True)
ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test")
for image_processing_class in self.image_processor_list:
# with apply_OCR = True
image_processing = image_processing_class()
image = Image.open(ds[0]["file"]).convert("RGB")
image = ds[0]["image"]
encoding = image_processing(image, return_tensors="pt")

View File

@ -28,8 +28,6 @@ from ...test_processing_common import ProcessorTesterMixin
if is_pytesseract_available():
from PIL import Image
from transformers import LayoutLMv2ImageProcessor
@ -156,11 +154,11 @@ class LayoutLMv2ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
from datasets import load_dataset
# set up
datasets = load_dataset("nielsr/funsd", trust_remote_code=True)
datasets = load_dataset("nielsr/funsd")
processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased", revision="no_ocr")
def preprocess_data(examples):
images = [Image.open(path).convert("RGB") for path in examples["image_path"]]
images = [image.convert("RGB") for image in examples["image"]]
words = examples["words"]
boxes = examples["bboxes"]
word_labels = examples["ner_tags"]
@ -192,12 +190,8 @@ class LayoutLMv2ProcessorIntegrationTests(unittest.TestCase):
# we verify our implementation on 2 document images from the DocVQA dataset
from datasets import load_dataset
ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test", trust_remote_code=True)
image_1 = Image.open(ds[0]["file"]).convert("RGB")
image_2 = Image.open(ds[1]["file"]).convert("RGB")
return image_1, image_2
ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test")
return ds[0]["image"].convert("RGB"), ds[1]["image"].convert("RGB")
@cached_property
def get_tokenizers(self):

View File

@ -22,8 +22,6 @@ from ...test_image_processing_common import ImageProcessingTestMixin, prepare_im
if is_pytesseract_available():
from PIL import Image
from transformers import LayoutLMv3ImageProcessor
if is_torchvision_available():
@ -103,17 +101,16 @@ class LayoutLMv3ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase)
image_processor = image_processing_class.from_dict(self.image_processor_dict, size=42)
self.assertEqual(image_processor.size, {"height": 42, "width": 42})
@unittest.skip("temporary to avoid failing on circleci")
def test_LayoutLMv3_integration_test(self):
from datasets import load_dataset
ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test", trust_remote_code=True)
ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test")
# with apply_OCR = True
for image_processing_class in self.image_processor_list:
image_processor = image_processing_class()
image = Image.open(ds[0]["file"]).convert("RGB")
image = ds[0]["image"].convert("RGB")
encoding = image_processor(image, return_tensors="pt")

View File

@ -28,8 +28,6 @@ from ...test_processing_common import ProcessorTesterMixin
if is_pytesseract_available():
from PIL import Image
from transformers import LayoutLMv3ImageProcessor
@ -172,12 +170,8 @@ class LayoutLMv3ProcessorIntegrationTests(unittest.TestCase):
# we verify our implementation on 2 document images from the DocVQA dataset
from datasets import load_dataset
ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test", trust_remote_code=True)
image_1 = Image.open(ds[0]["file"]).convert("RGB")
image_2 = Image.open(ds[1]["file"]).convert("RGB")
return image_1, image_2
ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test")
return ds[0]["image"].convert("RGB"), ds[1]["image"].convert("RGB")
@cached_property
def get_tokenizers(self):

View File

@ -33,8 +33,6 @@ from ...test_processing_common import ProcessorTesterMixin
if is_pytesseract_available():
from PIL import Image
from transformers import LayoutLMv2ImageProcessor
@ -162,11 +160,11 @@ class LayoutXLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
from datasets import load_dataset
# set up
datasets = load_dataset("nielsr/funsd", trust_remote_code=True)
datasets = load_dataset("nielsr/funsd")
processor = LayoutXLMProcessor.from_pretrained("microsoft/layoutxlm-base", apply_ocr=False)
def preprocess_data(examples):
images = [Image.open(path).convert("RGB") for path in examples["image_path"]]
images = [image.convert("RGB") for image in examples["image"]]
words = examples["words"]
boxes = examples["bboxes"]
word_labels = examples["ner_tags"]
@ -200,12 +198,8 @@ class LayoutXLMProcessorIntegrationTests(unittest.TestCase):
# we verify our implementation on 2 document images from the DocVQA dataset
from datasets import load_dataset
ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test", trust_remote_code=True)
image_1 = Image.open(ds[0]["file"]).convert("RGB")
image_2 = Image.open(ds[1]["file"]).convert("RGB")
return image_1, image_2
ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test")
return ds[0]["image"].convert("RGB"), ds[1]["image"].convert("RGB")
@cached_property
def get_tokenizers(self):

View File

@ -27,8 +27,6 @@ if is_torch_available():
import torch
if is_vision_available():
from PIL import Image
from transformers import MobileViTImageProcessor
@ -86,23 +84,14 @@ class MobileViTImageProcessingTester:
def prepare_semantic_single_inputs():
dataset = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
image = Image.open(dataset[0]["file"])
map = Image.open(dataset[1]["file"])
return image, map
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
example = ds[0]
return example["image"], example["map"]
def prepare_semantic_batch_inputs():
dataset = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
image1 = Image.open(dataset[0]["file"])
map1 = Image.open(dataset[1]["file"])
image2 = Image.open(dataset[2]["file"])
map2 = Image.open(dataset[3]["file"])
return [image1, image2], [map1, map2]
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
return list(ds["image"][:2]), list(ds["map"][:2])
@require_torch
@ -135,7 +124,6 @@ class MobileViTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
self.assertEqual(image_processor.size, {"shortest_edge": 42})
self.assertEqual(image_processor.crop_size, {"height": 84, "width": 84})
@unittest.skip("temporary to avoid failing on circleci")
def test_call_segmentation_maps(self):
# Initialize image_processing
image_processing = self.image_processing_class(**self.image_processor_dict)

View File

@ -86,8 +86,12 @@ class NougatImageProcessingTester:
return self.num_channels, self.size["height"], self.size["width"]
def prepare_dummy_image(self):
revision = "ec57bf8c8b1653a209c13f6e9ee66b12df0fc2db"
filepath = hf_hub_download(
repo_id="hf-internal-testing/fixtures_docvqa", filename="nougat_pdf.png", repo_type="dataset"
repo_id="hf-internal-testing/fixtures_docvqa",
filename="nougat_pdf.png",
repo_type="dataset",
revision=revision,
)
image = Image.open(filepath).convert("RGB")
return image
@ -136,7 +140,6 @@ class NougatImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
image_processor = self.image_processing_class.from_dict(self.image_processor_dict, size=42)
self.assertEqual(image_processor.size, {"height": 42, "width": 42})
@unittest.skip("temporary to avoid failing on circleci")
def test_expected_output(self):
dummy_image = self.image_processor_tester.prepare_dummy_image()
image_processor = self.image_processor
@ -180,13 +183,16 @@ class NougatImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
self.assertEqual((3, 100, 200), aligned_image.shape)
def prepare_dummy_np_image(self):
revision = "ec57bf8c8b1653a209c13f6e9ee66b12df0fc2db"
filepath = hf_hub_download(
repo_id="hf-internal-testing/fixtures_docvqa", filename="nougat_pdf.png", repo_type="dataset"
repo_id="hf-internal-testing/fixtures_docvqa",
filename="nougat_pdf.png",
repo_type="dataset",
revision=revision,
)
image = Image.open(filepath).convert("RGB")
return np.array(image)
@unittest.skip("temporary to avoid failing on circleci")
def test_crop_margin_equality_cv2_python(self):
image = self.prepare_dummy_np_image()
image_processor = self.image_processor

View File

@ -842,11 +842,8 @@ def prepare_img():
# Helper functions for optical flow integration test
def prepare_optical_flow_images():
dataset = load_dataset("hf-internal-testing/fixtures_sintel", split="test", trust_remote_code=True)
image1 = Image.open(dataset[0]["file"]).convert("RGB")
image2 = Image.open(dataset[0]["file"]).convert("RGB")
return image1, image2
ds = load_dataset("hf-internal-testing/fixtures_sintel", split="test")
return list(ds["image"][:2])
def normalize(img):

View File

@ -33,13 +33,13 @@ from transformers import (
from transformers.testing_utils import (
Expectations,
cleanup,
require_soundfile,
require_torch,
require_torch_large_accelerator,
require_torchcodec,
slow,
torch_device,
)
from transformers.utils import is_soundfile_available
from transformers.utils import is_torchcodec_available
from ...generation.test_utils import GenerationTesterMixin
from ...test_configuration_common import ConfigTester
@ -54,8 +54,8 @@ if is_vision_available():
from PIL import Image
if is_soundfile_available():
import soundfile
if is_torchcodec_available():
import torchcodec
class Phi4MultimodalModelTester:
@ -300,7 +300,8 @@ class Phi4MultimodalIntegrationTest(unittest.TestCase):
tmp.write(requests.get(self.audio_url, stream=True).raw.data)
tmp.flush()
tmp.seek(0)
self.audio, self.sampling_rate = soundfile.read(tmp.name)
samples = torchcodec.decoders.AudioDecoder(tmp.name).get_all_samples()
self.audio, self.sampling_rate = samples.data, samples.sample_rate
cleanup(torch_device, gc_collect=True)
@ -378,7 +379,7 @@ class Phi4MultimodalIntegrationTest(unittest.TestCase):
self.assertEqual(response, EXPECTED_RESPONSE)
@require_soundfile
@require_torchcodec
def test_audio_text_generation(self):
model = AutoModelForCausalLM.from_pretrained(
self.checkpoint_path, revision=self.revision, torch_dtype=torch.float16, device_map=torch_device

View File

@ -27,8 +27,6 @@ if is_torch_available():
import torch
if is_vision_available():
from PIL import Image
from transformers import SegformerImageProcessor
@ -86,23 +84,14 @@ class SegformerImageProcessingTester:
def prepare_semantic_single_inputs():
dataset = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
image = Image.open(dataset[0]["file"])
map = Image.open(dataset[1]["file"])
return image, map
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
example = ds[0]
return example["image"], example["map"]
def prepare_semantic_batch_inputs():
dataset = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
image1 = Image.open(dataset[0]["file"])
map1 = Image.open(dataset[1]["file"])
image2 = Image.open(dataset[2]["file"])
map2 = Image.open(dataset[3]["file"])
return [image1, image2], [map1, map2]
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
return list(ds["image"][:2]), list(ds["map"][:2])
@require_torch
@ -138,7 +127,6 @@ class SegformerImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
self.assertEqual(image_processor.size, {"height": 42, "width": 42})
self.assertEqual(image_processor.do_reduce_labels, True)
@unittest.skip("temporary to avoid failing on circleci")
def test_call_segmentation_maps(self):
# Initialize image_processing
image_processing = self.image_processing_class(**self.image_processor_dict)
@ -245,7 +233,6 @@ class SegformerImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
self.assertTrue(encoding["labels"].min().item() >= 0)
self.assertTrue(encoding["labels"].max().item() <= 255)
@unittest.skip("temporary to avoid failing on circleci")
def test_reduce_labels(self):
# Initialize image_processing
image_processing = self.image_processing_class(**self.image_processor_dict)

View File

@ -19,7 +19,7 @@ import unittest
import pytest
from transformers import SEWConfig, is_torch_available
from transformers.testing_utils import require_soundfile, require_torch, slow, torch_device
from transformers.testing_utils import require_torch, require_torchcodec, slow, torch_device
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import (
@ -453,7 +453,7 @@ class SEWUtilsTest(unittest.TestCase):
@require_torch
@require_soundfile
@require_torchcodec
@slow
class SEWModelIntegrationTest(unittest.TestCase):
def _load_datasamples(self, num_samples):

View File

@ -19,7 +19,7 @@ import unittest
import pytest
from transformers import SEWDConfig, is_torch_available
from transformers.testing_utils import require_soundfile, require_torch, slow, torch_device
from transformers.testing_utils import require_torch, require_torchcodec, slow, torch_device
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import (
@ -464,7 +464,7 @@ class SEWDUtilsTest(unittest.TestCase):
@require_torch
@require_soundfile
@require_torchcodec
@slow
class SEWDModelIntegrationTest(unittest.TestCase):
def _load_datasamples(self, num_samples):

View File

@ -16,9 +16,9 @@ import copy
import inspect
import unittest
from huggingface_hub import hf_hub_download
from datasets import load_dataset
from transformers import UdopConfig, is_torch_available, is_vision_available
from transformers import UdopConfig, is_torch_available
from transformers.testing_utils import (
require_sentencepiece,
require_tokenizers,
@ -42,10 +42,6 @@ if is_torch_available():
from transformers import UdopEncoderModel, UdopForConditionalGeneration, UdopModel, UdopProcessor
if is_vision_available():
from PIL import Image
class UdopModelTester:
def __init__(
self,
@ -618,12 +614,8 @@ class UdopEncoderOnlyModelTest(ModelTesterMixin, unittest.TestCase):
class UdopModelIntegrationTests(unittest.TestCase):
@cached_property
def image(self):
filepath = hf_hub_download(
repo_id="hf-internal-testing/fixtures_docvqa", filename="document_2.png", repo_type="dataset"
)
image = Image.open(filepath).convert("RGB")
return image
ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test")
return ds[1]["image"]
@cached_property
def processor(self):

View File

@ -41,8 +41,6 @@ if is_torch_available():
if is_pytesseract_available():
from PIL import Image
from transformers import LayoutLMv3ImageProcessor
@ -184,11 +182,11 @@ class UdopProcessorTest(ProcessorTesterMixin, unittest.TestCase):
from datasets import load_dataset
# set up
datasets = load_dataset("nielsr/funsd", trust_remote_code=True)
datasets = load_dataset("nielsr/funsd")
processor = UdopProcessor.from_pretrained("microsoft/udop-large", apply_ocr=False)
def preprocess_data(examples):
images = [Image.open(path).convert("RGB") for path in examples["image_path"]]
images = [image.convert("RGB") for image in examples["image"]]
words = examples["words"]
boxes = examples["bboxes"]
word_labels = examples["ner_tags"]
@ -222,12 +220,8 @@ class UdopProcessorIntegrationTests(unittest.TestCase):
# we verify our implementation on 2 document images from the DocVQA dataset
from datasets import load_dataset
ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test", trust_remote_code=True)
image_1 = Image.open(ds[0]["file"]).convert("RGB")
image_2 = Image.open(ds[1]["file"]).convert("RGB")
return image_1, image_2
ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test")
return ds[0]["image"].convert("RGB"), ds[1]["image"].convert("RGB")
@cached_property
def get_tokenizers(self):

View File

@ -21,7 +21,7 @@ import pytest
from datasets import load_dataset
from transformers import UniSpeechConfig, is_torch_available
from transformers.testing_utils import is_flaky, require_soundfile, require_torch, slow, torch_device
from transformers.testing_utils import is_flaky, require_torch, require_torchcodec, slow, torch_device
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import (
@ -553,7 +553,7 @@ class UniSpeechRobustModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.T
@require_torch
@require_soundfile
@require_torchcodec
@slow
class UniSpeechModelIntegrationTest(unittest.TestCase):
def _load_datasamples(self, num_samples):
@ -566,7 +566,7 @@ class UniSpeechModelIntegrationTest(unittest.TestCase):
return [x["array"] for x in speech_samples]
def _load_superb(self, task, num_samples):
ds = load_dataset("anton-l/superb_dummy", task, split="test", trust_remote_code=True)
ds = load_dataset("anton-l/superb_dummy", task, split="test")
return ds[:num_samples]
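`trust_remote_code` is dropped throughout, presumably because the branch pins `datasets@main`, where script-based loaders are gone and standard hub formats are read directly (an assumption consistent with the "test datasets@main" commit, not stated in this hunk). The slimmed-down call:

```python
from datasets import load_dataset

# No trust_remote_code: the hub data is resolved in a standard format,
# no dataset script is executed.
ds = load_dataset("anton-l/superb_dummy", "ks", split="test")
batch = ds[:2]  # slicing returns a dict of columns, as _load_superb relies on
```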

View File

@ -21,7 +21,7 @@ import pytest
from datasets import load_dataset
from transformers import UniSpeechSatConfig, is_torch_available
from transformers.testing_utils import require_soundfile, require_torch, slow, torch_device
from transformers.testing_utils import require_torch, require_torchcodec, slow, torch_device
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import (
@ -807,7 +807,7 @@ class UniSpeechSatRobustModelTest(ModelTesterMixin, unittest.TestCase):
@require_torch
@require_soundfile
@require_torchcodec
@slow
class UniSpeechSatModelIntegrationTest(unittest.TestCase):
def _load_datasamples(self, num_samples):
@ -820,7 +820,7 @@ class UniSpeechSatModelIntegrationTest(unittest.TestCase):
return [x["array"] for x in speech_samples]
def _load_superb(self, task, num_samples):
ds = load_dataset("anton-l/superb_dummy", task, split="test", trust_remote_code=True)
ds = load_dataset("anton-l/superb_dummy", task, split="test")
return ds[:num_samples]

View File

@ -15,7 +15,7 @@
import unittest
from huggingface_hub import hf_hub_download
from datasets import load_dataset
from transformers import ConvNextConfig, UperNetConfig
from transformers.testing_utils import (
@ -41,8 +41,6 @@ if is_torch_available():
if is_vision_available():
from PIL import Image
from transformers import AutoImageProcessor
@ -277,11 +275,8 @@ class UperNetModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase)
# We will verify our results on an image of ADE20k
def prepare_img():
filepath = hf_hub_download(
repo_id="hf-internal-testing/fixtures_ade20k", repo_type="dataset", filename="ADE_val_00000001.jpg"
)
image = Image.open(filepath).convert("RGB")
return image
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
return ds[0]["image"].convert("RGB")
@require_torch

View File

@ -637,9 +637,9 @@ class ViltModelIntegrationTest(unittest.TestCase):
processor = self.default_processor
dataset = load_dataset("hf-internal-testing/fixtures_nlvr2", split="test", trust_remote_code=True)
image1 = Image.open(dataset[0]["file"]).convert("RGB")
image2 = Image.open(dataset[1]["file"]).convert("RGB")
dataset = load_dataset("hf-internal-testing/fixtures_nlvr2", split="train")
image1 = dataset[0]["image"]
image2 = dataset[1]["image"]
text = (
"The left image contains twice the number of dogs as the right image, and at least two dogs in total are"

View File

@ -1149,8 +1149,8 @@ class TrOCRModelIntegrationTest(unittest.TestCase):
def test_inference_handwritten(self):
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten").to(torch_device)
dataset = load_dataset("hf-internal-testing/fixtures_ocr", split="test", trust_remote_code=True)
image = Image.open(dataset[0]["file"]).convert("RGB")
dataset = load_dataset("hf-internal-testing/fixtures_ocr", split="train")
image = dataset[0]["image"].convert("RGB")
processor = self.default_processor
pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(torch_device)
@ -1174,8 +1174,8 @@ class TrOCRModelIntegrationTest(unittest.TestCase):
def test_inference_printed(self):
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-printed").to(torch_device)
dataset = load_dataset("hf-internal-testing/fixtures_ocr", split="test", trust_remote_code=True)
image = Image.open(dataset[1]["file"]).convert("RGB")
dataset = load_dataset("hf-internal-testing/fixtures_ocr", split="train")
image = dataset[0]["image"].convert("RGB")
processor = self.default_processor
pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(torch_device)

View File

@ -34,10 +34,10 @@ from transformers.testing_utils import (
is_torchaudio_available,
require_flash_attn,
require_pyctcdecode,
require_soundfile,
require_torch,
require_torch_gpu,
require_torchaudio,
require_torchcodec,
run_test_in_subprocess,
slow,
torch_device,
@ -97,9 +97,7 @@ def _test_wav2vec2_with_lm_invalid_pool(in_queue, out_queue, timeout):
try:
_ = in_queue.get(timeout=timeout)
ds = load_dataset(
"mozilla-foundation/common_voice_11_0", "es", split="test", streaming=True, trust_remote_code=True
)
ds = load_dataset("mozilla-foundation/common_voice_11_0", "es", split="test", streaming=True)
sample = next(iter(ds))
resampled_audio = torchaudio.functional.resample(
@ -1452,7 +1450,7 @@ class Wav2Vec2UtilsTest(unittest.TestCase):
@require_torch
@require_soundfile
@require_torchcodec
@slow
class Wav2Vec2ModelIntegrationTest(unittest.TestCase):
def tearDown(self):
@ -1470,7 +1468,7 @@ class Wav2Vec2ModelIntegrationTest(unittest.TestCase):
return [x["array"] for x in speech_samples]
def _load_superb(self, task, num_samples):
ds = load_dataset("anton-l/superb_dummy", task, split="test", trust_remote_code=True)
ds = load_dataset("anton-l/superb_dummy", task, split="test")
return ds[:num_samples]
@ -1836,9 +1834,7 @@ class Wav2Vec2ModelIntegrationTest(unittest.TestCase):
@require_pyctcdecode
@require_torchaudio
def test_wav2vec2_with_lm(self):
ds = load_dataset(
"mozilla-foundation/common_voice_11_0", "es", split="test", streaming=True, trust_remote_code=True
)
ds = load_dataset("mozilla-foundation/common_voice_11_0", "es", split="test", streaming=True)
sample = next(iter(ds))
resampled_audio = torchaudio.functional.resample(
@ -1862,9 +1858,7 @@ class Wav2Vec2ModelIntegrationTest(unittest.TestCase):
@require_pyctcdecode
@require_torchaudio
def test_wav2vec2_with_lm_pool(self):
ds = load_dataset(
"mozilla-foundation/common_voice_11_0", "es", split="test", streaming=True, trust_remote_code=True
)
ds = load_dataset("mozilla-foundation/common_voice_11_0", "es", split="test", streaming=True)
sample = next(iter(ds))
resampled_audio = torchaudio.functional.resample(
@ -1963,9 +1957,7 @@ class Wav2Vec2ModelIntegrationTest(unittest.TestCase):
LANG_MAP = {"it": "ita", "es": "spa", "fr": "fra", "en": "eng"}
def run_model(lang):
ds = load_dataset(
"mozilla-foundation/common_voice_11_0", lang, split="test", streaming=True, trust_remote_code=True
)
ds = load_dataset("mozilla-foundation/common_voice_11_0", lang, split="test", streaming=True)
sample = next(iter(ds))
wav2vec2_lang = LANG_MAP[lang]

View File

@ -463,9 +463,7 @@ class Wav2Vec2ProcessorWithLMTest(unittest.TestCase):
def test_word_time_stamp_integration(self):
import torch
ds = load_dataset(
"mozilla-foundation/common_voice_11_0", "en", split="train", streaming=True, trust_remote_code=True
)
ds = load_dataset("mozilla-foundation/common_voice_11_0", "en", split="train", streaming=True)
ds = ds.cast_column("audio", datasets.Audio(sampling_rate=16_000))
ds_iter = iter(ds)
sample = next(ds_iter)
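A self-contained sketch of the streaming pattern used here: with `streaming=True` nothing is materialized until the iterator is consumed, and `cast_column` re-decodes the audio at 16 kHz on the fly:

```python
import datasets
from datasets import load_dataset

ds = load_dataset("mozilla-foundation/common_voice_11_0", "en", split="train", streaming=True)
# Re-decode the "audio" column at 16 kHz when samples are read.
ds = ds.cast_column("audio", datasets.Audio(sampling_rate=16_000))
sample = next(iter(ds))
array = sample["audio"]["array"]  # 16 kHz waveform as a numpy array
```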

View File

@ -473,7 +473,7 @@ class WavLMModelIntegrationTest(unittest.TestCase):
return [x["array"] for x in speech_samples]
def _load_superb(self, task, num_samples):
ds = load_dataset("anton-l/superb_dummy", task, split="test", trust_remote_code=True)
ds = load_dataset("anton-l/superb_dummy", task, split="test")
return ds[:num_samples]

View File

@ -1645,9 +1645,7 @@ class WhisperModelIntegrationTests(unittest.TestCase):
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v3")
model.to(torch_device)
ds = load_dataset(
"facebook/multilingual_librispeech", "german", split="test", streaming=True, trust_remote_code=True
)
ds = load_dataset("facebook/multilingual_librispeech", "german", split="test", streaming=True)
ds = ds.cast_column("audio", datasets.Audio(sampling_rate=16_000))
input_speech = next(iter(ds))["audio"]["array"]
@ -1714,11 +1712,10 @@ class WhisperModelIntegrationTests(unittest.TestCase):
token = os.getenv("HF_HUB_READ_TOKEN", True)
ds = load_dataset(
"mozilla-foundation/common_voice_6_1",
"hf-internal-testing/fixtures_common_voice",
"ja",
split="test",
streaming=True,
trust_remote_code=True,
token=token,
)
ds = ds.cast_column("audio", datasets.Audio(sampling_rate=16_000))
@ -1728,7 +1725,10 @@ class WhisperModelIntegrationTests(unittest.TestCase):
torch_device
)
EXPECTED_TRANSCRIPTS = ["木村さんに電話を貸してもらいました", " Kimura-san called me."]
EXPECTED_TRANSCRIPTS = [
"夏の時期の時期でした",
" It was the time of day and all of the pens left during the summer.",
]
generated_ids = model.generate(
input_features.repeat(2, 1, 1),
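Assembled from the pieces of this hunk: the loader now reads the internal `hf-internal-testing/fixtures_common_voice` fixture instead of the gated `mozilla-foundation/common_voice_6_1` release, and since the underlying audio differs, `EXPECTED_TRANSCRIPTS` is updated in the same hunk. The updated loading path looks like:

```python
import os
import datasets
from datasets import load_dataset

token = os.getenv("HF_HUB_READ_TOKEN", True)
ds = load_dataset("hf-internal-testing/fixtures_common_voice", "ja", split="test", streaming=True, token=token)
ds = ds.cast_column("audio", datasets.Audio(sampling_rate=16_000))
input_speech = next(iter(ds))["audio"]["array"]
```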

View File

@ -179,7 +179,7 @@ class AudioClassificationPipelineTests(unittest.TestCase):
model = "superb/wav2vec2-base-superb-ks"
audio_classifier = pipeline("audio-classification", model=model)
dataset = datasets.load_dataset("anton-l/superb_dummy", "ks", split="test", trust_remote_code=True)
dataset = datasets.load_dataset("anton-l/superb_dummy", "ks", split="test")
audio = np.array(dataset[3]["speech"], dtype=np.float32)
output = audio_classifier(audio, top_k=4)

View File

@ -265,9 +265,7 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
@require_torch
@require_pyctcdecode
def test_large_model_pt_with_lm(self):
dataset = load_dataset("Narsil/asr_dummy", streaming=True, trust_remote_code=True)
third_item = next(iter(dataset["test"].skip(3)))
filename = third_item["file"]
filename = hf_hub_download("Narsil/asr_dummy", filename="4.flac", repo_type="dataset")
speech_recognizer = pipeline(
task="automatic-speech-recognition",
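Fetching the single `.flac` fixture directly replaces streaming the whole `Narsil/asr_dummy` dataset and skipping to its fourth item. A sketch of the pattern (the model id below is illustrative only; the test pins its own with-LM checkpoint):

```python
from huggingface_hub import hf_hub_download
from transformers import pipeline

# One deterministic file fetch instead of iterating a streaming dataset.
filename = hf_hub_download("Narsil/asr_dummy", filename="4.flac", repo_type="dataset")

asr = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h")  # illustrative model id
print(asr(filename)["text"])
```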
@ -388,7 +386,7 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
chunk_length_s=8,
stride_length_s=1,
)
data = load_dataset("openslr/librispeech_asr", "clean", split="test", streaming=True, trust_remote_code=True)
data = load_dataset("openslr/librispeech_asr", "clean", split="test", streaming=True)
sample = next(iter(data))
res = pipe(sample["audio"]["array"])
@ -434,7 +432,7 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
stride_length_s=1,
return_language=True,
)
data = load_dataset("openslr/librispeech_asr", "clean", split="test", streaming=True, trust_remote_code=True)
data = load_dataset("openslr/librispeech_asr", "clean", split="test", streaming=True)
sample = next(iter(data))
res = pipe(sample["audio"]["array"])
@ -489,7 +487,7 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
task="automatic-speech-recognition",
model="openai/whisper-tiny.en",
)
data = load_dataset("openslr/librispeech_asr", "clean", split="test", streaming=True, trust_remote_code=True)
data = load_dataset("openslr/librispeech_asr", "clean", split="test", streaming=True)
samples = [next(iter(data)) for _ in range(8)]
audio = np.concatenate([sample["audio"]["array"] for sample in samples])
@ -1125,9 +1123,7 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
@slow
def test_speculative_decoding_whisper_non_distil(self):
# Load data:
dataset = load_dataset(
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:1]", trust_remote_code=True
)
dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:1]")
sample = dataset[0]["audio"]
# Load model:
@ -1169,9 +1165,7 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
@slow
def test_speculative_decoding_whisper_distil(self):
# Load data:
dataset = load_dataset(
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:1]", trust_remote_code=True
)
dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:1]")
sample = dataset[0]["audio"]
# Load model:

View File

@ -601,9 +601,9 @@ class ImageSegmentationPipelineTests(unittest.TestCase):
image_segmenter = pipeline("image-segmentation", model=model, image_processor=image_processor)
image = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
file = image[0]["file"]
outputs = image_segmenter(file, threshold=threshold)
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
image = ds[0]["image"].convert("RGB")
outputs = image_segmenter(image, threshold=threshold)
# Shortening by hashing
for o in outputs:
@ -655,9 +655,9 @@ class ImageSegmentationPipelineTests(unittest.TestCase):
def test_oneformer(self):
image_segmenter = pipeline(model="shi-labs/oneformer_ade20k_swin_tiny")
image = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True)
file = image[0]["file"]
outputs = image_segmenter(file, threshold=0.99)
ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
image = ds[0]["image"].convert("RGB")
outputs = image_segmenter(image, threshold=0.99)
# Shortening by hashing
for o in outputs:
o["mask"] = mask_to_test_readable(o["mask"])
@ -679,7 +679,7 @@ class ImageSegmentationPipelineTests(unittest.TestCase):
)
# Different task
outputs = image_segmenter(file, threshold=0.99, subtask="instance")
outputs = image_segmenter(image, threshold=0.99, subtask="instance")
# Shortening by hashing
for o in outputs:
o["mask"] = mask_to_test_readable(o["mask"])
@ -701,7 +701,7 @@ class ImageSegmentationPipelineTests(unittest.TestCase):
)
# Different task
outputs = image_segmenter(file, subtask="semantic")
outputs = image_segmenter(image, subtask="semantic")
# Shortening by hashing
for o in outputs:
o["mask"] = mask_to_test_readable(o["mask"])