Compare commits

...

8 Commits

SHA1 Message Date
290ae0b0a3 fix 2025-07-02 14:37:59 +02:00
5586fe7be2 trigger CI with datasets main 2025-07-01 22:17:14 +02:00
d92a22fd6d remove duplicate 2025-07-01 22:11:25 +02:00
45a8a4a6e4 torchcodec in dockerfiles and requirements 2025-07-01 22:11:25 +02:00
986698a6a7 torchcodec in docstrings and testing utils 2025-07-01 22:11:25 +02:00
d20ab64120 keep same dataset actually 2025-07-01 22:11:25 +02:00
79c9aece59 bump version 2025-07-01 22:11:25 +02:00
c1223639b9 fix dataset run_object_detection 2025-07-01 22:11:25 +02:00
46 changed files with 232 additions and 243 deletions

View File

@@ -97,6 +97,14 @@ jobs:
         run: |
           python3 utils/print_env.py
+
+      - name: Install datasets main
+        working-directory: /transformers
+        run: python3 -m pip install --no-cache-dir git+https://github.com/huggingface/datasets.git@main
+
+      - name: Install torchcodec
+        working-directory: /transformers
+        run: python3 -m pip install --no-cache-dir torch torchvision torchaudio torchcodec --index-url https://download.pytorch.org/whl/cu126
       - name: Show installed libraries and their versions
         working-directory: /transformers
        run: pip freeze
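
The cu126 index URL pins torch, torchvision, torchaudio, and torchcodec to matching CUDA 12.6 builds. A minimal sanity check along these lines (a sketch, not part of the diff; assumes the container's python3) confirms the wheels line up, since a torch/torchcodec mismatch typically surfaces at import time:

import torch
import torchcodec  # assumption: the package exposes __version__ like most PyPI packages
from torchcodec.decoders import AudioDecoder  # a mismatched torch build usually fails here

print(f"torch {torch.__version__} (CUDA {torch.version.cuda})")
print(f"torchcodec {torchcodec.__version__}")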

View File

@@ -7,7 +7,7 @@ on:
     - cron: "17 2 * * *"
   push:
     branches:
-      - run_scheduled_ci*
+      - fix-dataset-run_object_detection-and-add-torchcodec-trigger-ci
   workflow_dispatch:
     inputs:
       prev_workflow_run_id:
@@ -24,7 +24,7 @@ on:
 # Used for `push` to easily modify the target workflow runs to compare against
 env:
-  prev_workflow_run_id: ""
+  prev_workflow_run_id: "15988665799"
   other_workflow_run_id: ""
@@ -50,64 +50,8 @@ jobs:
     uses: ./.github/workflows/self-scheduled.yml
     with:
       job: run_models_gpu
-      slack_report_channel: "#transformers-ci-daily-models"
+      slack_report_channel: "#transformers-dummy"
      docker: huggingface/transformers-all-latest-gpu
      ci_event: Daily CI
      report_repo_id: hf-internal-testing/transformers_daily_ci
    secrets: inherit
-
-  torch-pipeline:
-    name: Torch pipeline CI
-    uses: ./.github/workflows/self-scheduled.yml
-    with:
-      job: run_pipelines_torch_gpu
-      slack_report_channel: "#transformers-ci-daily-pipeline-torch"
-      docker: huggingface/transformers-pytorch-gpu
-      ci_event: Daily CI
-      report_repo_id: hf-internal-testing/transformers_daily_ci
-    secrets: inherit
-
-  example-ci:
-    name: Example CI
-    uses: ./.github/workflows/self-scheduled.yml
-    with:
-      job: run_examples_gpu
-      slack_report_channel: "#transformers-ci-daily-examples"
-      docker: huggingface/transformers-all-latest-gpu
-      ci_event: Daily CI
-      report_repo_id: hf-internal-testing/transformers_daily_ci
-    secrets: inherit
-
-  trainer-fsdp-ci:
-    name: Trainer/FSDP CI
-    uses: ./.github/workflows/self-scheduled.yml
-    with:
-      job: run_trainer_and_fsdp_gpu
-      slack_report_channel: "#transformers-ci-daily-training"
-      docker: huggingface/transformers-all-latest-gpu
-      ci_event: Daily CI
-      report_repo_id: hf-internal-testing/transformers_daily_ci
-    secrets: inherit
-
-  deepspeed-ci:
-    name: DeepSpeed CI
-    uses: ./.github/workflows/self-scheduled.yml
-    with:
-      job: run_torch_cuda_extensions_gpu
-      slack_report_channel: "#transformers-ci-daily-training"
-      docker: huggingface/transformers-pytorch-deepspeed-latest-gpu
-      ci_event: Daily CI
-      working-directory-prefix: /workspace
-      report_repo_id: hf-internal-testing/transformers_daily_ci
-    secrets: inherit
-
-  quantization-ci:
-    name: Quantization CI
-    uses: ./.github/workflows/self-scheduled.yml
-    with:
-      job: run_quantization_torch_gpu
-      slack_report_channel: "#transformers-ci-daily-quantization"
-      docker: huggingface/transformers-quantization-latest-gpu
-      ci_event: Daily CI
-      report_repo_id: hf-internal-testing/transformers_daily_ci
-    secrets: inherit

View File

@@ -2,10 +2,10 @@ FROM python:3.9-slim
 ENV PYTHONDONTWRITEBYTECODE=1
 ARG REF=main
 USER root
-RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git
+RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git ffmpeg
 ENV UV_PYTHON=/usr/local/bin/python
 RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
-RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' 'torchcodec' --index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing]" seqeval albumentations jiwer
 RUN uv pip uninstall transformers

View File

@@ -2,10 +2,10 @@ FROM python:3.9-slim
 ENV PYTHONDONTWRITEBYTECODE=1
 ARG REF=main
 USER root
-RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git pkg-config openssh-client git
+RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git pkg-config openssh-client git ffmpeg
 ENV UV_PYTHON=/usr/local/bin/python
 RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
-RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' 'torchcodec' --index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing]"
 RUN uv pip uninstall transformers

View File

@@ -2,10 +2,10 @@ FROM python:3.9-slim
 ENV PYTHONDONTWRITEBYTECODE=1
 ARG REF=main
 USER root
-RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git git-lfs
+RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git git-lfs ffmpeg
 ENV UV_PYTHON=/usr/local/bin/python
 RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
-RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' 'torchcodec' --index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing,tiktoken,num2words,video]"
 RUN uv pip uninstall transformers

View File

@@ -64,15 +64,15 @@ predicted token ids.
 >>> import torch
 >>> from transformers import Speech2Text2Processor, SpeechEncoderDecoderModel
 >>> from datasets import load_dataset
->>> import soundfile as sf
+>>> from torchcodec.decoders import AudioDecoder
 >>> model = SpeechEncoderDecoderModel.from_pretrained("facebook/s2t-wav2vec2-large-en-de")
 >>> processor = Speech2Text2Processor.from_pretrained("facebook/s2t-wav2vec2-large-en-de")
 >>> def map_to_array(batch):
-...     speech, _ = sf.read(batch["file"])
-...     batch["speech"] = speech
+...     decoder = AudioDecoder(batch["file"])
+...     batch["speech"] = decoder.get_all_samples().data
 ...     return batch
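
This rewrite recurs throughout the compare: `sf.read` returned a `(numpy array, sample_rate)` tuple, while torchcodec's `AudioDecoder.get_all_samples()` returns an `AudioSamples` container whose `.data` is a channels-first `torch.Tensor`. A minimal sketch of the new decode path (file name hypothetical; attribute names per torchcodec's documented API):

from torchcodec.decoders import AudioDecoder

decoder = AudioDecoder("audio.flac")     # hypothetical input file
samples = decoder.get_all_samples()      # AudioSamples container
waveform = samples.data                  # torch.Tensor of shape (num_channels, num_samples)
rate = samples.sample_rate               # int; sf.read exposed this via the now-dropped tuple element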

View File

@@ -22,6 +22,7 @@ protobuf
 torch
 torchvision
 torchaudio
+torchcodec
 jiwer
 librosa
 evaluate >= 0.2.0

View File

@@ -1,5 +1,5 @@
 albumentations >= 1.4.16
 timm
-datasets
+datasets>=4.0
 torchmetrics
 pycocotools

View File

@@ -399,7 +399,7 @@ def main():
         dataset["validation"] = split["test"]
     # Get dataset categories and prepare mappings for label_name <-> label_id
-    categories = dataset["train"].features["objects"].feature["category"].names
+    categories = dataset["train"].features["objects"]["category"].feature.names
     id2label = dict(enumerate(categories))
     label2id = {v: k for k, v in id2label.items()}
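
This one-line change (mirrored in the no-trainer script below) is the bug fix the branch name refers to: datasets 4.0 flips the nesting of sequence features, so the per-object `category` labels are reached as a column of the `objects` feature rather than through a wrapping `Sequence`. A hedged sketch of both spellings (`cppe-5` is the dataset these example scripts conventionally use; adjust for your own data):

from datasets import load_dataset

dataset = load_dataset("cppe-5", split="train")
objects = dataset.features["objects"]

# datasets < 4.0: a Sequence wrapping a dict of fields
# categories = objects.feature["category"].names
# datasets >= 4.0: a dict of per-field sequences; the ClassLabel sits under .feature
categories = objects["category"].feature.names
print(categories[:3])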

View File

@@ -460,7 +460,7 @@ def main():
         dataset["validation"] = split["test"]
     # Get dataset categories and prepare mappings for label_name <-> label_id
-    categories = dataset["train"].features["objects"].feature["category"].names
+    categories = dataset["train"].features["objects"]["category"].feature.names
     id2label = dict(enumerate(categories))
     label2id = {v: k for k, v in id2label.items()}

View File

@@ -436,9 +436,10 @@ class ASTModel(ASTPreTrainedModel):
 input_values (`torch.FloatTensor` of shape `(batch_size, max_length, num_mel_bins)`):
     Float values mel features extracted from the raw audio waveform. Raw audio waveform can be obtained by
     loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via
-    the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
-    [`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a
-    tensor of type `torch.FloatTensor`. See [`~ASTFeatureExtractor.__call__`]
+    the torchcodec library (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
+    To prepare the array into `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the
+    mel features, padding and conversion into a tensor of type `torch.FloatTensor`.
+    See [`~ASTFeatureExtractor.__call__`]
 """
 output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
 output_hidden_states = (
@@ -526,9 +527,10 @@ class ASTForAudioClassification(ASTPreTrainedModel):
 input_values (`torch.FloatTensor` of shape `(batch_size, max_length, num_mel_bins)`):
     Float values mel features extracted from the raw audio waveform. Raw audio waveform can be obtained by
     loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via
-    the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
-    [`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a
-    tensor of type `torch.FloatTensor`. See [`~ASTFeatureExtractor.__call__`]
+    the torchcodec library (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
+    To prepare the array into `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the
+    mel features, padding and conversion into a tensor of type `torch.FloatTensor`.
+    See [`~ASTFeatureExtractor.__call__`]
 labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
     Labels for computing the audio classification/regression loss. Indices should be in `[0, ...,
     config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
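
The same three-line rewording repeats across every audio model below; only the processor class in the closing `See [...]` reference differs. For the AST case, the pipeline the docstring describes would look roughly like this (checkpoint and file name are illustrative, not taken from the diff; the `sample_rate` resampling argument is assumed from torchcodec's docs):

import torch
from torchcodec.decoders import AudioDecoder
from transformers import AutoFeatureExtractor

extractor = AutoFeatureExtractor.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")

# Decode and resample to the extractor's expected rate
samples = AudioDecoder("audio.wav", sample_rate=16000).get_all_samples()
waveform = samples.data.mean(dim=0)  # downmix to mono; the extractor expects 1-D input

inputs = extractor(waveform.numpy(), sampling_rate=16000, return_tensors="pt")
print(inputs["input_values"].shape)  # (1, max_length, num_mel_bins)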

View File

@@ -1053,9 +1053,10 @@ class Data2VecAudioForSequenceClassification(Data2VecAudioPreTrainedModel):
 r"""
 input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
     Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
-    into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
-    soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
-    conversion into a tensor of type `torch.FloatTensor`. See [`Data2VecAudioProcessor.__call__`] for details.
+    into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
+    torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
+    the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
+    See [`Data2VecAudioProcessor.__call__`] for details.
 labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
     Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
     config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
@@ -1167,9 +1168,10 @@ class Data2VecAudioForAudioFrameClassification(Data2VecAudioPreTrainedModel):
 r"""
 input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
     Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
-    into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
-    soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
-    conversion into a tensor of type `torch.FloatTensor`. See [`Data2VecAudioProcessor.__call__`] for details.
+    into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
+    torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
+    the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
+    See [`Data2VecAudioProcessor.__call__`] for details.
 labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
     Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
     config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
@@ -1349,9 +1351,10 @@ class Data2VecAudioForXVector(Data2VecAudioPreTrainedModel):
 r"""
 input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
     Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
-    into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
-    soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
-    conversion into a tensor of type `torch.FloatTensor`. See [`Data2VecAudioProcessor.__call__`] for details.
+    into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
+    torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
+    the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
+    See [`Data2VecAudioProcessor.__call__`] for details.
 labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
     Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
     config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If

View File

@@ -985,15 +985,15 @@ class HubertModel(HubertPreTrainedModel):
 ```python
 >>> from transformers import AutoProcessor, HubertModel
 >>> from datasets import load_dataset
->>> import soundfile as sf
+>>> from torchcodec.decoders import AudioDecoder
 >>> processor = AutoProcessor.from_pretrained("facebook/hubert-large-ls960-ft")
 >>> model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft")
 >>> def map_to_array(batch):
-...     speech, _ = sf.read(batch["file"])
-...     batch["speech"] = speech
+...     decoder = AudioDecoder(batch["file"])
+...     batch["speech"] = decoder.get_all_samples().data
 ...     return batch
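
In the surrounding doctest (elided from this hunk), `map_to_array` is applied with `Dataset.map`. A hedged reconstruction of that usage — the dummy LibriSpeech dataset and its `"file"` column are what these doctests conventionally rely on, not something shown in this excerpt:

from datasets import load_dataset
from torchcodec.decoders import AudioDecoder

ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")

def map_to_array(batch):
    decoder = AudioDecoder(batch["file"])
    batch["speech"] = decoder.get_all_samples().data
    return batch

ds = ds.map(map_to_array)  # "speech" now holds the decoded waveform per example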
@@ -1261,9 +1261,10 @@ class HubertForSequenceClassification(HubertPreTrainedModel):
 r"""
 input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
     Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
-    into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
-    soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
-    conversion into a tensor of type `torch.FloatTensor`. See [`HubertProcessor.__call__`] for details.
+    into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
+    torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
+    the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
+    See [`HubertProcessor.__call__`] for details.
 labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
     Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
     config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If

View File

@@ -1459,15 +1459,15 @@ class TFHubertModel(TFHubertPreTrainedModel):
 ```python
 >>> from transformers import AutoProcessor, TFHubertModel
 >>> from datasets import load_dataset
->>> import soundfile as sf
+>>> from torchcodec.decoders import AudioDecoder
 >>> processor = AutoProcessor.from_pretrained("facebook/hubert-large-ls960-ft")
 >>> model = TFHubertModel.from_pretrained("facebook/hubert-large-ls960-ft")
 >>> def map_to_array(batch):
-...     speech, _ = sf.read(batch["file"])
-...     batch["speech"] = speech
+...     decoder = AudioDecoder(batch["file"])
+...     batch["speech"] = decoder.get_all_samples().data
 ...     return batch
@@ -1571,15 +1571,15 @@ class TFHubertForCTC(TFHubertPreTrainedModel):
 >>> import tensorflow as tf
 >>> from transformers import AutoProcessor, TFHubertForCTC
 >>> from datasets import load_dataset
->>> import soundfile as sf
+>>> from torchcodec.decoders import AudioDecoder
 >>> processor = AutoProcessor.from_pretrained("facebook/hubert-large-ls960-ft")
 >>> model = TFHubertForCTC.from_pretrained("facebook/hubert-large-ls960-ft")
 >>> def map_to_array(batch):
-...     speech, _ = sf.read(batch["file"])
-...     batch["speech"] = speech
+...     decoder = AudioDecoder(batch["file"])
+...     batch["speech"] = decoder.get_all_samples().data
 ...     return batch

View File

@@ -239,15 +239,15 @@ class HubertModel(Wav2Vec2Model, HubertPreTrainedModel):
 ```python
 >>> from transformers import AutoProcessor, HubertModel
 >>> from datasets import load_dataset
->>> import soundfile as sf
+>>> from torchcodec.decoders import AudioDecoder
 >>> processor = AutoProcessor.from_pretrained("facebook/hubert-large-ls960-ft")
 >>> model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft")
 >>> def map_to_array(batch):
-...     speech, _ = sf.read(batch["file"])
-...     batch["speech"] = speech
+...     decoder = AudioDecoder(batch["file"])
+...     batch["speech"] = decoder.get_all_samples().data
 ...     return batch

View File

@@ -563,7 +563,8 @@ class MoonshineEncoder(MoonshinePreTrainedModel):
 input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
     Float values of the raw speech waveform. Raw speech waveform can be
     obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
-    `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
+    `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
+    the soundfile library (`pip install soundfile`). To prepare the array into
     `input_values`, the [`AutoFeatureExtractor`] should be used for padding
     and conversion into a tensor of type `torch.FloatTensor`.
 attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -1028,7 +1029,8 @@ class MoonshineModel(MoonshinePreTrainedModel):
 input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
     Float values of the raw speech waveform. Raw speech waveform can be
     obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
-    `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
+    `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
+    the soundfile library (`pip install soundfile`). To prepare the array into
     `input_values`, the [`AutoFeatureExtractor`] should be used for padding
     and conversion into a tensor of type `torch.FloatTensor`.
 decoder_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
@@ -1204,7 +1206,8 @@ class MoonshineForConditionalGeneration(MoonshinePreTrainedModel, GenerationMixi
 input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
     Float values of the raw speech waveform. Raw speech waveform can be
     obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
-    `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
+    `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
+    the soundfile library (`pip install soundfile`). To prepare the array into
     `input_values`, the [`AutoFeatureExtractor`] should be used for padding
     and conversion into a tensor of type `torch.FloatTensor`.
 decoder_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):

View File

@@ -587,7 +587,8 @@ class MoonshineEncoder(MoonshinePreTrainedModel):
 input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
     Float values of the raw speech waveform. Raw speech waveform can be
     obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
-    `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
+    `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
+    the soundfile library (`pip install soundfile`). To prepare the array into
     `input_values`, the [`AutoFeatureExtractor`] should be used for padding
     and conversion into a tensor of type `torch.FloatTensor`.
 attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -844,7 +845,8 @@ class MoonshineModel(WhisperModel):
 input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
     Float values of the raw speech waveform. Raw speech waveform can be
     obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
-    `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
+    `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
+    the soundfile library (`pip install soundfile`). To prepare the array into
     `input_values`, the [`AutoFeatureExtractor`] should be used for padding
     and conversion into a tensor of type `torch.FloatTensor`.
 decoder_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
@@ -1004,7 +1006,8 @@ class MoonshineForConditionalGeneration(MoonshinePreTrainedModel, GenerationMixi
 input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
     Float values of the raw speech waveform. Raw speech waveform can be
     obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
-    `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
+    `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
+    the soundfile library (`pip install soundfile`). To prepare the array into
     `input_values`, the [`AutoFeatureExtractor`] should be used for padding
     and conversion into a tensor of type `torch.FloatTensor`.
 decoder_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):

View File

@@ -797,7 +797,8 @@ class Qwen2_5OmniAudioEncoder(Qwen2_5OmniPreTrainedModel):
 input_features (`torch.LongTensor` of shape `(batch_size, feature_size, sequence_length)`):
     Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
     obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
-    `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
+    `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
+    the soundfile library (`pip install soundfile`). To prepare the array into
     `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
     and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
 feature_lens (`torch.LongTensor` of shape `(batch_size,)`):
@@ -1789,9 +1790,10 @@ class Qwen2_5OmniThinkerForConditionalGeneration(Qwen2_5OmniPreTrainedModelForCo
 input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, feature_sequence_length)`):
     Float values mel features extracted from the raw speech waveform. Raw speech waveform can be obtained by
     loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via
-    the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
-    [`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a
-    tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
+    the torchcodec library (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
+    To prepare the array into `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the
+    mel features, padding and conversion into a tensor of type `torch.FloatTensor`.
+    See [`~WhisperFeatureExtractor.__call__`]
 pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size), *optional*):
     The tensors corresponding to the input videos. Pixel values can be obtained using
     [`AutoImageProcessor`]. See [`SiglipImageProcessor.__call__`] for details ([`NewTaskModelProcessor`] uses

View File

@@ -1782,7 +1782,8 @@ class Qwen2_5OmniAudioEncoder(Qwen2_5OmniPreTrainedModel):
 input_features (`torch.LongTensor` of shape `(batch_size, feature_size, sequence_length)`):
     Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
     obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
-    `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
+    `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
+    the soundfile library (`pip install soundfile`). To prepare the array into
     `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
     and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
 feature_lens (`torch.LongTensor` of shape `(batch_size,)`):
@@ -2236,9 +2237,10 @@ class Qwen2_5OmniThinkerForConditionalGeneration(Qwen2_5OmniPreTrainedModelForCo
 input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, feature_sequence_length)`):
     Float values mel features extracted from the raw speech waveform. Raw speech waveform can be obtained by
     loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via
-    the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
-    [`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a
-    tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
+    the torchcodec library (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
+    To prepare the array into `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the
+    mel features, padding and conversion into a tensor of type `torch.FloatTensor`.
+    See [`~WhisperFeatureExtractor.__call__`]
 pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size), *optional*):
     The tensors corresponding to the input videos. Pixel values can be obtained using
     [`AutoImageProcessor`]. See [`SiglipImageProcessor.__call__`] for details ([`NewTaskModelProcessor`] uses

View File

@@ -360,7 +360,8 @@ class Qwen2AudioEncoder(Qwen2AudioPreTrainedModel):
 input_features (`torch.LongTensor` of shape `(batch_size, feature_size, sequence_length)`):
     Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
     obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
-    `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
+    `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
+    the soundfile library (`pip install soundfile`). To prepare the array into
     `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
     and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
 attention_mask (`torch.Tensor`, *optional*):
@@ -740,9 +741,10 @@ class Qwen2AudioForConditionalGeneration(Qwen2AudioPreTrainedModel, GenerationMi
 input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, feature_sequence_length)`):
     Float values mel features extracted from the raw speech waveform. Raw speech waveform can be obtained by
     loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via
-    the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
-    [`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a
-    tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
+    the torchcodec library (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
+    To prepare the array into `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the
+    mel features, padding and conversion into a tensor of type `torch.FloatTensor`.
+    See [`~WhisperFeatureExtractor.__call__`]
 feature_attention_mask (`torch.Tensor` of shape `(batch_size, feature_sequence_length)`):
     Mask to avoid performing attention on padding feature indices. Mask values selected in `[0, 1]`:

View File

@@ -1077,9 +1077,10 @@ class SEWForSequenceClassification(SEWPreTrainedModel):
 r"""
 input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
     Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
-    into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
-    soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
-    conversion into a tensor of type `torch.FloatTensor`. See [`SEWProcessor.__call__`] for details.
+    into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
+    torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
+    the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
+    See [`SEWProcessor.__call__`] for details.
 labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
     Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
     config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If

View File

@@ -1597,9 +1597,10 @@ class SEWDForSequenceClassification(SEWDPreTrainedModel):
 r"""
 input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
     Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
-    into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
-    soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
-    conversion into a tensor of type `torch.FloatTensor`. See [`SEWDProcessor.__call__`] for details.
+    into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
+    torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
+    the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
+    See [`SEWDProcessor.__call__`] for details.
 labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
     Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
     config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If

View File

@@ -86,8 +86,9 @@ SPEECH_ENCODER_DECODER_INPUTS_DOCSTRING = r"""
 Args:
     inputs (`jnp.ndarray` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, feature_dim)`, *optional*):
         Float values of input raw speech waveform or speech features. Values can be obtained by loading a `.flac`
-        or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile
-        library (`pip install soundfile`). To prepare the array into `inputs`, either the [`Wav2Vec2Processor`] or
+        or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip
+        install torchcodec`) or the soundfile library (`pip install soundfile`).
+        To prepare the array into `inputs`, either the [`Wav2Vec2Processor`] or
         [`Speech2TextProcessor`] should be used for padding and conversion into a tensor of type
         `torch.FloatTensor`.
     attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*):

View File

@@ -339,8 +339,9 @@ class SpeechEncoderDecoderModel(PreTrainedModel, GenerationMixin):
 r"""
 inputs (`torch.FloatTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, feature_dim)`, *optional*):
     Float values of input raw speech waveform or speech features. Values can be obtained by loading a `.flac`
-    or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile
-    library (`pip install soundfile`). To prepare the array into `inputs`, either the [`Wav2Vec2Processor`] or
+    or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip
+    install torchcodec`) or the soundfile library (`pip install soundfile`).
+    To prepare the array into `inputs`, either the [`Wav2Vec2Processor`] or
     [`Speech2TextProcessor`] should be used for padding and conversion into a tensor of type
     `torch.FloatTensor`.
 decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):

View File

@@ -620,7 +620,8 @@ class Speech2TextEncoder(Speech2TextPreTrainedModel):
 input_features (`torch.LongTensor` of shape `(batch_size, sequence_length, feature_size)`):
     Float values of fbank features extracted from the raw speech waveform. Raw speech waveform can be
     obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
-    `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
+    `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
+    the soundfile library (`pip install soundfile`). To prepare the array into
     `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the fbank features,
     padding and conversion into a tensor of type `torch.FloatTensor`. See
     [`~Speech2TextFeatureExtractor.__call__`]

View File

@@ -848,7 +848,8 @@ class TFSpeech2TextEncoder(keras.layers.Layer):
 input_features (`tf.Tensor` of shape `(batch_size, sequence_length, feature_size)`):
     Float values of fbank features extracted from the raw speech waveform. Raw speech waveform can be
     obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
-    `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
+    `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
+    the soundfile library (`pip install soundfile`). To prepare the array into
     `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the fbank features,
     padding and conversion into a tensor of floats. See [`~Speech2TextFeatureExtractor.__call__`]
 attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -1469,7 +1470,7 @@ class TFSpeech2TextForConditionalGeneration(TFSpeech2TextPreTrainedModel, TFCaus
 >>> import tensorflow as tf
 >>> from transformers import Speech2TextProcessor, TFSpeech2TextForConditionalGeneration
 >>> from datasets import load_dataset
->>> import soundfile as sf
+>>> from torchcodec.decoders import AudioDecoder
 >>> model = TFSpeech2TextForConditionalGeneration.from_pretrained(
 ...     "facebook/s2t-small-librispeech-asr", from_pt=True
@@ -1478,8 +1479,8 @@
 >>> def map_to_array(batch):
-...     speech, _ = sf.read(batch["file"])
-...     batch["speech"] = speech
+...     decoder = AudioDecoder(batch["file"])
+...     batch["speech"] = decoder.get_all_samples().data
 ...     return batch

View File

@@ -1486,9 +1486,10 @@ class UniSpeechForSequenceClassification(UniSpeechPreTrainedModel):
 r"""
 input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
     Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
-    into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
-    soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
-    conversion into a tensor of type `torch.FloatTensor`. See [`UniSpeechProcessor.__call__`] for details.
+    into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
+    torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
+    the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
+    See [`UniSpeechProcessor.__call__`] for details.
 labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
     Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
     config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If

View File

@@ -1481,9 +1481,10 @@ class UniSpeechSatForSequenceClassification(UniSpeechSatPreTrainedModel):
 r"""
 input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
     Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
-    into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
-    soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
-    conversion into a tensor of type `torch.FloatTensor`. See [`UniSpeechSatProcessor.__call__`] for details.
+    into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
+    torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
+    the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
+    See [`UniSpeechSatProcessor.__call__`] for details.
 labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
     Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
     config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
@@ -1595,9 +1596,10 @@ class UniSpeechSatForAudioFrameClassification(UniSpeechSatPreTrainedModel):
 r"""
 input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
     Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
-    into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
-    soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
-    conversion into a tensor of type `torch.FloatTensor`. See [`UniSpeechSatProcessor.__call__`] for details.
+    into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
+    torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
+    the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
+    See [`UniSpeechSatProcessor.__call__`] for details.
 labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
     Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
     config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
@@ -1777,9 +1779,10 @@ class UniSpeechSatForXVector(UniSpeechSatPreTrainedModel):
 r"""
 input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
     Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
-    into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
-    soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
-    conversion into a tensor of type `torch.FloatTensor`. See [`UniSpeechSatProcessor.__call__`] for details.
+    into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
+    torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
+    the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
+    See [`UniSpeechSatProcessor.__call__`] for details.
 labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
     Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
     config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If


@@ -1064,15 +1064,15 @@ FLAX_WAV2VEC2_MODEL_DOCSTRING = """
 ```python
 >>> from transformers import AutoProcessor, FlaxWav2Vec2Model
 >>> from datasets import load_dataset
->>> import soundfile as sf
+>>> from torchcodec.decoders import AudioDecoder

 >>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-large-lv60")
 >>> model = FlaxWav2Vec2Model.from_pretrained("facebook/wav2vec2-large-lv60")

 >>> def map_to_array(batch):
-...     speech, _ = sf.read(batch["file"])
-...     batch["speech"] = speech
+...     decoder = AudioDecoder(batch["file"])
+...     batch["speech"] = decoder.get_all_samples().data
 ...     return batch

@@ -1183,15 +1183,15 @@ FLAX_WAV2VEC2_FOR_CTC_DOCSTRING = """
 >>> import jax.numpy as jnp
 >>> from transformers import AutoProcessor, FlaxWav2Vec2ForCTC
 >>> from datasets import load_dataset
->>> import soundfile as sf
+>>> from torchcodec.decoders import AudioDecoder

 >>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-large-960h-lv60")
 >>> model = FlaxWav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h-lv60")

 >>> def map_to_array(batch):
-...     speech, _ = sf.read(batch["file"])
-...     batch["speech"] = speech
+...     decoder = AudioDecoder(batch["file"])
+...     batch["speech"] = decoder.get_all_samples().data
 ...     return batch

@@ -1384,15 +1384,15 @@ FLAX_WAV2VEC2_FOR_PRETRAINING_DOCSTRING = """
 >>> from transformers import AutoFeatureExtractor, FlaxWav2Vec2ForPreTraining
 >>> from transformers.models.wav2vec2.modeling_flax_wav2vec2 import _compute_mask_indices
 >>> from datasets import load_dataset
->>> import soundfile as sf
+>>> from torchcodec.decoders import AudioDecoder

 >>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-large-lv60")
 >>> model = FlaxWav2Vec2ForPreTraining.from_pretrained("facebook/wav2vec2-large-lv60")

 >>> def map_to_array(batch):
-...     speech, _ = sf.read(batch["file"])
-...     batch["speech"] = speech
+...     decoder = AudioDecoder(batch["file"])
+...     batch["speech"] = decoder.get_all_samples().data
 ...     return batch
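One behavioral note on the swap in these examples: `sf.read` returns a NumPy array plus a sample rate, while `AudioDecoder.get_all_samples()` returns a samples object whose `.data` is a channels-first torch tensor. A hedged side-by-side, with `audio.flac` as a placeholder path:

```python
# The two loading paths swapped in the examples above, side by side.
# "audio.flac" is a placeholder file path.
import soundfile as sf
from torchcodec.decoders import AudioDecoder

speech_np, rate = sf.read("audio.flac")      # np.ndarray, (num_samples,) for mono files

samples = AudioDecoder("audio.flac").get_all_samples()
speech_pt = samples.data                     # torch.Tensor, (num_channels, num_samples)
rate_pt = samples.sample_rate                # int
```

Downstream processors generally accept either input, but code that assumed a flat 1-D array may need a `squeeze` or transpose when moving to the channels-first tensor layout.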


@@ -1530,15 +1530,15 @@ class TFWav2Vec2Model(TFWav2Vec2PreTrainedModel):
 ```python
 >>> from transformers import AutoProcessor, TFWav2Vec2Model
 >>> from datasets import load_dataset
->>> import soundfile as sf
+>>> from torchcodec.decoders import AudioDecoder

 >>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h")
 >>> model = TFWav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")

 >>> def map_to_array(batch):
-...     speech, _ = sf.read(batch["file"])
-...     batch["speech"] = speech
+...     decoder = AudioDecoder(batch["file"])
+...     batch["speech"] = decoder.get_all_samples().data
 ...     return batch

@@ -1642,15 +1642,15 @@ class TFWav2Vec2ForCTC(TFWav2Vec2PreTrainedModel):
 >>> import tensorflow as tf
 >>> from transformers import AutoProcessor, TFWav2Vec2ForCTC
 >>> from datasets import load_dataset
->>> import soundfile as sf
+>>> from torchcodec.decoders import AudioDecoder

 >>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h")
 >>> model = TFWav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

 >>> def map_to_array(batch):
-...     speech, _ = sf.read(batch["file"])
-...     batch["speech"] = speech
+...     decoder = AudioDecoder(batch["file"])
+...     batch["speech"] = decoder.get_all_samples().data
 ...     return batch


@@ -2012,9 +2012,10 @@ class Wav2Vec2ForSequenceClassification(Wav2Vec2PreTrainedModel):
 r"""
 input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
     Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
-    into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
-    soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
-    conversion into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2Processor.__call__`] for details.
+    into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
+    torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
+    the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
+    See [`Wav2Vec2Processor.__call__`] for details.
 labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
     Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
     config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
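The `labels` contract described here (cross-entropy when `num_labels > 1`, MSE when it equals 1) can be seen end to end in a short sketch; the checkpoint and label count below are illustrative, and the random waveform stands in for real audio:

```python
# Single-label audio classification with the `labels` argument from the docstring.
import torch
from transformers import AutoFeatureExtractor, Wav2Vec2ForSequenceClassification

model = Wav2Vec2ForSequenceClassification.from_pretrained(
    "facebook/wav2vec2-base", num_labels=4  # classifier head is freshly initialized
)
feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")

waveform = torch.randn(16000).numpy()  # stand-in for one second of 16 kHz audio
inputs = feature_extractor(waveform, sampling_rate=16000, return_tensors="pt")

labels = torch.tensor([2])  # each label in [0, config.num_labels - 1]
outputs = model(**inputs, labels=labels)  # cross-entropy loss since num_labels > 1
print(outputs.loss, outputs.logits.shape)  # logits: (1, 4)
```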
@@ -2126,9 +2127,10 @@ class Wav2Vec2ForAudioFrameClassification(Wav2Vec2PreTrainedModel):
 r"""
 input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
     Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
-    into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
-    soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
-    conversion into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2Processor.__call__`] for details.
+    into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
+    torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
+    the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
+    See [`Wav2Vec2Processor.__call__`] for details.
 labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
     Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
     config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If

@@ -2308,9 +2310,10 @@ class Wav2Vec2ForXVector(Wav2Vec2PreTrainedModel):
 r"""
 input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
     Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
-    into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
-    soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
-    conversion into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2Processor.__call__`] for details.
+    into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
+    torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
+    the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
+    See [`Wav2Vec2Processor.__call__`] for details.
 labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
     Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
     config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If


@@ -1579,9 +1579,10 @@ class Wav2Vec2ConformerForSequenceClassification(Wav2Vec2ConformerPreTrainedModel):
 r"""
 input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
     Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
-    into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
-    soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
-    conversion into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2ConformerProcessor.__call__`] for details.
+    into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
+    torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
+    the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
+    See [`Wav2Vec2ConformerProcessor.__call__`] for details.
 labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
     Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
     config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If

@@ -1681,9 +1682,10 @@ class Wav2Vec2ConformerForAudioFrameClassification(Wav2Vec2ConformerPreTrainedModel):
 r"""
 input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
     Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
-    into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
-    soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
-    conversion into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2ConformerProcessor.__call__`] for details.
+    into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
+    torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
+    the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
+    See [`Wav2Vec2ConformerProcessor.__call__`] for details.
 labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
     Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
     config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If

@@ -1851,9 +1853,10 @@ class Wav2Vec2ConformerForXVector(Wav2Vec2ConformerPreTrainedModel):
 r"""
 input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
     Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
-    into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
-    soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
-    conversion into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2ConformerProcessor.__call__`] for details.
+    into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
+    torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
+    the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
+    See [`Wav2Vec2ConformerProcessor.__call__`] for details.
 labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
     Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
     config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If


@@ -1328,9 +1328,10 @@ class WavLMForSequenceClassification(WavLMPreTrainedModel):
 r"""
 input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
     Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
-    into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
-    soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
-    conversion into a tensor of type `torch.FloatTensor`. See [`WavLMProcessor.__call__`] for details.
+    into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
+    torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
+    the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
+    See [`WavLMProcessor.__call__`] for details.
 labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
     Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
     config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If

@@ -1442,9 +1443,10 @@ class WavLMForAudioFrameClassification(WavLMPreTrainedModel):
 r"""
 input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
     Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
-    into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
-    soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
-    conversion into a tensor of type `torch.FloatTensor`. See [`WavLMProcessor.__call__`] for details.
+    into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
+    torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
+    the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
+    See [`WavLMProcessor.__call__`] for details.
 labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
     Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
     config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If

@@ -1624,9 +1626,10 @@ class WavLMForXVector(WavLMPreTrainedModel):
 r"""
 input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
     Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
-    into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
-    soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
-    conversion into a tensor of type `torch.FloatTensor`. See [`WavLMProcessor.__call__`] for details.
+    into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
+    torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
+    the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
+    See [`WavLMProcessor.__call__`] for details.
 labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
     Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
     config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If


@@ -102,9 +102,10 @@ WHISPER_INPUTS_DOCSTRING = r"""
 input_features (`numpy.ndarray` of shape `(batch_size, feature_size, sequence_length)`):
     Float values mel features extracted from the raw speech waveform. Raw speech waveform can be obtained by
     loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via
-    the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
-    [`WhisperFeatureExtractor`] should be used for extracting the features, padding and conversion into a
-    tensor of type `numpy.ndarray`. See [`~WhisperFeatureExtractor.__call__`]
+    the torchcodec library (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
+    To prepare the array into `input_features`, the [`WhisperFeatureExtractor`] should be used for extracting
+    the features, padding and conversion into a tensor of type `numpy.ndarray`.
+    See [`~WhisperFeatureExtractor.__call__`]
 attention_mask (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
     Whisper does not support masking of the `input_features`, this argument is preserved for compatibility, but
     is not used. By default the silence in the input log mel spectrogram are ignored.

@@ -139,9 +140,10 @@ WHISPER_ENCODE_INPUTS_DOCSTRING = r"""
 input_features (`numpy.ndarray` of shape `(batch_size, feature_size, sequence_length)`):
     Float values mel features extracted from the raw speech waveform. Raw speech waveform can be obtained by
     loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via
-    the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
-    [`WhisperFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a
-    tensor of type `numpy.ndarray`. See [`~WhisperFeatureExtractor.__call__`].
+    the torchcodec library (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
+    To prepare the array into `input_features`, the [`WhisperFeatureExtractor`] should be used for extracting
+    the mel features, padding and conversion into a tensor of type `numpy.ndarray`.
+    See [`~WhisperFeatureExtractor.__call__`].
 attention_mask (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
     Whisper does not support masking of the `input_features`, this argument is preserved for compatibility, but
     is not used. By default the silence in the input log mel spectrogram are ignored.


@@ -601,9 +601,10 @@ WHISPER_INPUTS_DOCSTRING = r"""
 input_features (`tf.Tensor` of shape `(batch_size, feature_size, sequence_length)`):
     Float values of fbank features extracted from the raw speech waveform. Raw speech waveform can be obtained
     by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.*
-    via the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
-    [`AutoFeatureExtractor`] should be used for extracting the fbank features, padding and conversion into a
-    tensor of type `tf.Tensor`. See [`~WhisperFeatureExtractor.__call__`]
+    via the torchcodec library (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
+    To prepare the array into `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the
+    fbank features, padding and conversion into a tensor of type `tf.Tensor`.
+    See [`~WhisperFeatureExtractor.__call__`]
 decoder_input_ids (`tf.Tensor` of shape `(batch_size, target_sequence_length)`, *optional*):
     Indices of decoder input sequence tokens in the vocabulary.

@@ -729,7 +730,8 @@ class TFWhisperEncoder(keras.layers.Layer):
 input_features (`tf.Tensor` of shape `(batch_size, feature_size, sequence_length)`):
     Float values of fbank features extracted from the raw speech waveform. Raw speech waveform can be
     obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
-    `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
+    `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
+    the soundfile library (`pip install soundfile`). To prepare the array into
     `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the fbank features,
     padding and conversion into a tensor of type `tf.Tensor`. See [`~WhisperFeatureExtractor.__call__`]
 head_mask (`tf.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):


@@ -651,7 +651,8 @@ class WhisperEncoder(WhisperPreTrainedModel):
 input_features (`torch.LongTensor` of shape `(batch_size, feature_size, sequence_length)`):
     Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
     obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
-    `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
+    `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
+    the soundfile library (`pip install soundfile`). To prepare the array into
     `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
     and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
 attention_mask (`torch.Tensor`)`, *optional*):

@@ -1096,9 +1097,10 @@ class WhisperModel(WhisperPreTrainedModel):
 input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, sequence_length)`):
     Float values mel features extracted from the raw speech waveform. Raw speech waveform can be obtained by
     loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via
-    the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
-    [`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a
-    tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
+    the torchcodec library (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
+    To prepare the array into `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the
+    mel features, padding and conversion into a tensor of type `torch.FloatTensor`.
+    See [`~WhisperFeatureExtractor.__call__`]
 decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
     Indices of decoder input sequence tokens in the vocabulary.

@@ -1266,9 +1268,10 @@ class WhisperForConditionalGeneration(WhisperGenerationMixin, WhisperPreTrainedModel):
 input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, sequence_length)`):
     Float values mel features extracted from the raw speech waveform. Raw speech waveform can be obtained by
     loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via
-    the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
-    [`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a
-    tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
+    the torchcodec library (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
+    To prepare the array into `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the
+    mel features, padding and conversion into a tensor of type `torch.FloatTensor`.
+    See [`~WhisperFeatureExtractor.__call__`]
 decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
     Indices of decoder input sequence tokens in the vocabulary.

@@ -1600,9 +1603,10 @@ class WhisperForAudioClassification(WhisperPreTrainedModel):
 input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, sequence_length)`):
     Float values mel features extracted from the raw speech waveform. Raw speech waveform can be obtained by
     loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via
-    the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
-    [`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a
-    tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
+    the torchcodec library (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
+    To prepare the array into `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the
+    mel features, padding and conversion into a tensor of type `torch.FloatTensor`.
+    See [`~WhisperFeatureExtractor.__call__`]
 labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
     Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
     config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
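Unlike the wav2vec2-family `input_values`, Whisper consumes log-mel `input_features`. A hedged sketch of that preparation path, assuming a 16 kHz mono file; `sample.wav` is a placeholder and the checkpoint is illustrative:

```python
# Decode audio with torchcodec, then extract Whisper's log-mel `input_features`.
from torchcodec.decoders import AudioDecoder
from transformers import WhisperFeatureExtractor

feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-tiny")

samples = AudioDecoder("sample.wav").get_all_samples()
waveform = samples.data.squeeze(0).numpy()  # mono waveform; Whisper expects 16 kHz

inputs = feature_extractor(waveform, sampling_rate=16000, return_tensors="pt")
print(inputs.input_features.shape)  # (1, feature_size, sequence_length)
```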


@@ -130,7 +130,6 @@ from .utils import (
     is_scipy_available,
     is_sentencepiece_available,
     is_seqio_available,
-    is_soundfile_available,
     is_spacy_available,
     is_speech_available,
     is_spqr_available,

@@ -656,7 +655,7 @@ def require_torchcodec(test_case):
     These tests are skipped when Torchcodec isn't installed.
     """
-    return unittest.skipUnless(is_torchcodec_available(), "test requires Torchvision")(test_case)
+    return unittest.skipUnless(is_torchcodec_available(), "test requires Torchcodec")(test_case)

 def require_torch_or_tf(test_case):

@@ -1268,16 +1267,6 @@ def require_clearml(test_case):
     return unittest.skipUnless(is_clearml_available(), "test requires clearml")(test_case)

-def require_soundfile(test_case):
-    """
-    Decorator marking a test that requires soundfile
-    These tests are skipped when soundfile isn't installed.
-    """
-    return unittest.skipUnless(is_soundfile_available(), "test requires soundfile")(test_case)

 def require_deepspeed(test_case):
     """
     Decorator marking a test that requires deepspeed
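These `require_*` helpers all follow the same `unittest.skipUnless` pattern and compose freely. A minimal sketch of how the new decorator is consumed by the test files changed below; the test class itself is hypothetical:

```python
# Hypothetical test showing how the skip decorators from testing_utils stack.
import unittest

from transformers.testing_utils import require_torch, require_torchcodec, slow


@require_torch
@require_torchcodec
@slow
class ExampleAudioIntegrationTest(unittest.TestCase):  # illustrative class name
    def test_runs_only_with_torchcodec(self):
        # Safe to import here: the whole class is skipped if torchcodec is absent.
        from torchcodec.decoders import AudioDecoder

        self.assertTrue(callable(AudioDecoder))
```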


@@ -248,9 +248,10 @@ class ModelArgs:
     input_values = {
         "description": """
     Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
-    into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
-    soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
-    conversion into a tensor of type `torch.FloatTensor`. See [`{processor_class}.__call__`] for details.
+    into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the torchcodec library (`pip install
+    torchcodec`) or the soundfile library (`pip install soundfile`). To prepare the array into `input_values`,
+    the [`AutoProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`.
+    See [`{processor_class}.__call__`] for details.
     """,
         "shape": "of shape `(batch_size, sequence_length)`",
     }
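This shared description is a template: the `{processor_class}` placeholder is filled in per model when docstrings are generated, which is why editing it once updates every model that uses it. A minimal sketch of that substitution, with `Wav2Vec2Processor` as one illustrative value (the real machinery lives in transformers' auto-docstring utilities and is more involved):

```python
# Illustrative placeholder substitution; the template text is abbreviated.
template = (
    "To prepare the array into `input_values`, the [`AutoProcessor`] should be "
    "used for padding and conversion into a tensor of type `torch.FloatTensor`. "
    "See [`{processor_class}.__call__`] for details."
)
docstring = template.format(processor_class="Wav2Vec2Processor")
print(docstring)  # ... See [`Wav2Vec2Processor.__call__`] for details.
```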


@@ -21,7 +21,7 @@ from datasets import load_dataset
 from tests.test_modeling_common import floats_tensor, ids_tensor, random_attention_mask
 from transformers import Data2VecAudioConfig, is_torch_available
-from transformers.testing_utils import require_soundfile, require_torch, slow, torch_device
+from transformers.testing_utils import require_torch, require_torchcodec, slow, torch_device

 from ...test_configuration_common import ConfigTester
 from ...test_modeling_common import ModelTesterMixin, _config_zero_init

@@ -656,7 +656,7 @@ class Data2VecAudioUtilsTest(unittest.TestCase):
 @require_torch
-@require_soundfile
+@require_torchcodec
 @slow
 class Data2VecAudioModelIntegrationTest(unittest.TestCase):
     def _load_datasamples(self, num_samples):
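For reference, the `_load_datasamples` helper these integration test classes share conventionally looks like the sketch below; the dummy LibriSpeech fixture is the dataset these tests typically use, and exact details vary per file:

```python
# Rough shape of the shared test helper, assuming the conventional dummy fixture.
from datasets import load_dataset


def _load_datasamples(num_samples):
    ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
    speech_samples = ds.sort("id")[:num_samples]["audio"]
    # With `datasets` main, audio decoding is backed by torchcodec, which is
    # why these tests now require it; the returned sample type depends on the
    # installed `datasets` version.
    return [x["array"] for x in speech_samples]
```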


@@ -22,7 +22,7 @@ import unittest
 import pytest

 from transformers import HubertConfig, is_torch_available
-from transformers.testing_utils import require_soundfile, require_torch, slow, torch_device
+from transformers.testing_utils import require_torch, require_torchcodec, slow, torch_device

 from ...test_configuration_common import ConfigTester
 from ...test_modeling_common import (

@@ -750,7 +750,7 @@ class HubertUtilsTest(unittest.TestCase):
 @require_torch
-@require_soundfile
+@require_torchcodec
 @slow
 class HubertModelIntegrationTest(unittest.TestCase):
     def _load_datasamples(self, num_samples):


@@ -33,13 +33,13 @@ from transformers import (
 from transformers.testing_utils import (
     Expectations,
     cleanup,
-    require_soundfile,
     require_torch,
     require_torch_large_accelerator,
+    require_torchcodec,
     slow,
     torch_device,
 )
-from transformers.utils import is_soundfile_available
+from transformers.utils import is_torchcodec_available

 from ...generation.test_utils import GenerationTesterMixin
 from ...test_configuration_common import ConfigTester

@@ -54,8 +54,8 @@ if is_vision_available():
     from PIL import Image

-if is_soundfile_available():
-    import soundfile
+if is_torchcodec_available():
+    import torchcodec

 class Phi4MultimodalModelTester:
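The guarded-import idiom this hunk switches over, shown in isolation (both names are real and appear in the diff above):

```python
# Import the optional backend only when transformers reports it as installed.
from transformers.utils import is_torchcodec_available

if is_torchcodec_available():
    import torchcodec  # only reached when the package is actually present
```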
@@ -300,7 +300,8 @@ class Phi4MultimodalIntegrationTest(unittest.TestCase):
     tmp.write(requests.get(self.audio_url, stream=True).raw.data)
     tmp.flush()
     tmp.seek(0)
-    self.audio, self.sampling_rate = soundfile.read(tmp.name)
+    samples = torchcodec.decoders.AudioDecoder(tmp.name).get_all_samples()
+    self.audio, self.sampling_rate = samples.data, samples.sample_rate

     cleanup(torch_device, gc_collect=True)

@@ -378,7 +379,7 @@ class Phi4MultimodalIntegrationTest(unittest.TestCase):
     self.assertEqual(response, EXPECTED_RESPONSE)

-@require_soundfile
+@require_torchcodec
 def test_audio_text_generation(self):
     model = AutoModelForCausalLM.from_pretrained(
         self.checkpoint_path, revision=self.revision, torch_dtype=torch.float16, device_map=torch_device


@@ -19,7 +19,7 @@ import unittest
 import pytest

 from transformers import SEWConfig, is_torch_available
-from transformers.testing_utils import require_soundfile, require_torch, slow, torch_device
+from transformers.testing_utils import require_torch, require_torchcodec, slow, torch_device

 from ...test_configuration_common import ConfigTester
 from ...test_modeling_common import (

@@ -453,7 +453,7 @@ class SEWUtilsTest(unittest.TestCase):
 @require_torch
-@require_soundfile
+@require_torchcodec
 @slow
 class SEWModelIntegrationTest(unittest.TestCase):
     def _load_datasamples(self, num_samples):


@@ -19,7 +19,7 @@ import unittest
 import pytest

 from transformers import SEWDConfig, is_torch_available
-from transformers.testing_utils import require_soundfile, require_torch, slow, torch_device
+from transformers.testing_utils import require_torch, require_torchcodec, slow, torch_device

 from ...test_configuration_common import ConfigTester
 from ...test_modeling_common import (

@@ -464,7 +464,7 @@ class SEWDUtilsTest(unittest.TestCase):
 @require_torch
-@require_soundfile
+@require_torchcodec
 @slow
 class SEWDModelIntegrationTest(unittest.TestCase):
     def _load_datasamples(self, num_samples):


@@ -21,7 +21,7 @@ import pytest
 from datasets import load_dataset

 from transformers import UniSpeechConfig, is_torch_available
-from transformers.testing_utils import is_flaky, require_soundfile, require_torch, slow, torch_device
+from transformers.testing_utils import is_flaky, require_torch, require_torchcodec, slow, torch_device

 from ...test_configuration_common import ConfigTester
 from ...test_modeling_common import (

@@ -553,7 +553,7 @@ class UniSpeechRobustModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
 @require_torch
-@require_soundfile
+@require_torchcodec
 @slow
 class UniSpeechModelIntegrationTest(unittest.TestCase):
     def _load_datasamples(self, num_samples):


@@ -21,7 +21,7 @@ import pytest
 from datasets import load_dataset

 from transformers import UniSpeechSatConfig, is_torch_available
-from transformers.testing_utils import require_soundfile, require_torch, slow, torch_device
+from transformers.testing_utils import require_torch, require_torchcodec, slow, torch_device

 from ...test_configuration_common import ConfigTester
 from ...test_modeling_common import (

@@ -807,7 +807,7 @@ class UniSpeechSatRobustModelTest(ModelTesterMixin, unittest.TestCase):
 @require_torch
-@require_soundfile
+@require_torchcodec
 @slow
 class UniSpeechSatModelIntegrationTest(unittest.TestCase):
     def _load_datasamples(self, num_samples):


@@ -34,10 +34,10 @@ from transformers.testing_utils import (
     is_torchaudio_available,
     require_flash_attn,
     require_pyctcdecode,
-    require_soundfile,
     require_torch,
     require_torch_gpu,
     require_torchaudio,
+    require_torchcodec,
     run_test_in_subprocess,
     slow,
     torch_device,

@@ -1444,7 +1444,7 @@ class Wav2Vec2UtilsTest(unittest.TestCase):
 @require_torch
-@require_soundfile
+@require_torchcodec
 @slow
 class Wav2Vec2ModelIntegrationTest(unittest.TestCase):
     def tearDown(self):