Files
transformers/tests/models/gemma3n/test_feature_extraction_gemma3n.py
Yuanyuan Chen 9e99198e5e Use | for Optional and Union typing (#41646)
Signed-off-by: Yuanyuan Chen <cyyever@outlook.com>
2025-10-16 14:29:54 +00:00

299 lines
13 KiB
Python

# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import itertools
import os
import random
import tempfile
import unittest
from collections.abc import Sequence
import numpy as np
from parameterized import parameterized
from transformers.models.gemma3n import Gemma3nAudioFeatureExtractor
from transformers.testing_utils import (
check_json_file_has_correct_format,
require_torch,
)
from transformers.utils.import_utils import is_torch_available
from ...test_sequence_feature_extraction_common import SequenceFeatureExtractionTestMixin
if is_torch_available():
pass
global_rng = random.Random()
MAX_LENGTH_FOR_TESTING = 512
def floats_list(shape, scale=1.0, rng=None):
"""Creates a random float32 tensor"""
if rng is None:
rng = global_rng
values = []
for _ in range(shape[0]):
values.append([])
for _ in range(shape[1]):
values[-1].append(rng.random() * scale)
return values
class Gemma3nAudioFeatureExtractionTester:
def __init__(
self,
parent,
batch_size=7,
min_seq_length=400,
max_seq_length=2000,
feature_size: int = 128,
sampling_rate: int = 16_000,
padding_value: float = 0.0,
return_attention_mask: bool = False,
# ignore hop_length / frame_length for now, as ms -> length conversion causes issues with serialization tests
# frame_length_ms: float = 32.0,
# hop_length: float = 10.0,
min_frequency: float = 125.0,
max_frequency: float = 7600.0,
preemphasis: float = 0.97,
preemphasis_htk_flavor: bool = True,
fft_overdrive: bool = True,
dither: float = 0.0,
input_scale_factor: float = 1.0,
mel_floor: float = 1e-5,
per_bin_mean: Sequence[float] | None = None,
per_bin_stddev: Sequence[float] | None = None,
):
self.parent = parent
self.batch_size = batch_size
self.min_seq_length = min_seq_length
self.max_seq_length = max_seq_length
self.seq_length_diff = (self.max_seq_length - self.min_seq_length) // (self.batch_size - 1)
self.feature_size = feature_size
self.sampling_rate = sampling_rate
self.padding_value = padding_value
self.return_attention_mask = return_attention_mask
# ignore hop_length / frame_length for now, as ms -> length conversion causes issues with serialization tests
# self.frame_length_ms = frame_length_ms
# self.hop_length = hop_length
self.min_frequency = min_frequency
self.max_frequency = max_frequency
self.preemphasis = preemphasis
self.preemphasis_htk_flavor = preemphasis_htk_flavor
self.fft_overdrive = fft_overdrive
self.dither = dither
self.input_scale_factor = input_scale_factor
self.mel_floor = mel_floor
self.per_bin_mean = per_bin_mean
self.per_bin_stddev = per_bin_stddev
def prepare_feat_extract_dict(self):
return {
"feature_size": self.feature_size,
"sampling_rate": self.sampling_rate,
"padding_value": self.padding_value,
"return_attention_mask": self.return_attention_mask,
"min_frequency": self.min_frequency,
"max_frequency": self.max_frequency,
"preemphasis": self.preemphasis,
"preemphasis_htk_flavor": self.preemphasis_htk_flavor,
"fft_overdrive": self.fft_overdrive,
"dither": self.dither,
"input_scale_factor": self.input_scale_factor,
"mel_floor": self.mel_floor,
"per_bin_mean": self.per_bin_mean,
"per_bin_stddev": self.per_bin_stddev,
}
def prepare_inputs_for_common(self, equal_length=False, numpify=False):
def _flatten(list_of_lists):
return list(itertools.chain(*list_of_lists))
if equal_length:
speech_inputs = [floats_list((self.max_seq_length, self.feature_size)) for _ in range(self.batch_size)]
else:
# make sure that inputs increase in size
speech_inputs = [
floats_list((x, self.feature_size))
for x in range(self.min_seq_length, self.max_seq_length, self.seq_length_diff)
]
if numpify:
speech_inputs = [np.asarray(x) for x in speech_inputs]
return speech_inputs
class Gemma3nAudioFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
feature_extraction_class = Gemma3nAudioFeatureExtractor
def setUp(self):
self.feat_extract_tester = Gemma3nAudioFeatureExtractionTester(self)
def test_feat_extract_from_and_save_pretrained(self):
feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict)
with tempfile.TemporaryDirectory() as tmpdirname:
saved_file = feat_extract_first.save_pretrained(tmpdirname)[0]
check_json_file_has_correct_format(saved_file)
feat_extract_second = self.feature_extraction_class.from_pretrained(tmpdirname)
dict_first = feat_extract_first.to_dict()
dict_second = feat_extract_second.to_dict()
mel_1 = feat_extract_first.mel_filters
mel_2 = feat_extract_second.mel_filters
self.assertTrue(np.allclose(mel_1, mel_2))
self.assertEqual(dict_first, dict_second)
def test_feat_extract_to_json_file(self):
feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict)
with tempfile.TemporaryDirectory() as tmpdirname:
json_file_path = os.path.join(tmpdirname, "feat_extract.json")
feat_extract_first.to_json_file(json_file_path)
feat_extract_second = self.feature_extraction_class.from_json_file(json_file_path)
dict_first = feat_extract_first.to_dict()
dict_second = feat_extract_second.to_dict()
mel_1 = feat_extract_first.mel_filters
mel_2 = feat_extract_second.mel_filters
self.assertTrue(np.allclose(mel_1, mel_2))
self.assertEqual(dict_first, dict_second)
def test_feat_extract_from_pretrained_kwargs(self):
feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict)
with tempfile.TemporaryDirectory() as tmpdirname:
saved_file = feat_extract_first.save_pretrained(tmpdirname)[0]
check_json_file_has_correct_format(saved_file)
feat_extract_second = self.feature_extraction_class.from_pretrained(
tmpdirname, feature_size=2 * self.feat_extract_dict["feature_size"]
)
mel_1 = feat_extract_first.mel_filters
mel_2 = feat_extract_second.mel_filters
self.assertTrue(2 * mel_1.shape[1] == mel_2.shape[1])
@parameterized.expand(
[
([floats_list((1, x))[0] for x in range(800, 1400, 200)],),
([floats_list((1, x))[0] for x in (800, 800, 800)],),
([floats_list((1, x))[0] for x in range(200, (MAX_LENGTH_FOR_TESTING + 500), 200)], True),
]
)
def test_call(self, audio_inputs, test_truncation=False):
feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
np_audio_inputs = [np.asarray(audio_input) for audio_input in audio_inputs]
input_features = feature_extractor(np_audio_inputs, padding="max_length", return_tensors="np").input_features
self.assertTrue(input_features.ndim == 3)
# input_features.shape should be (batch, num_frames, n_mels) ~= (batch, num_frames, feature_size)
# 480_000 is the max_length that inputs are padded to. we use that to calculate num_frames
expected_num_frames = (480_000 - feature_extractor.frame_length) // (feature_extractor.hop_length) + 1
self.assertTrue(
input_features.shape[-2] == expected_num_frames,
f"no match: {input_features.shape[-1]} vs {expected_num_frames}",
)
self.assertTrue(input_features.shape[-1] == feature_extractor.feature_size)
encoded_sequences_1 = feature_extractor(audio_inputs, return_tensors="np").input_features
encoded_sequences_2 = feature_extractor(np_audio_inputs, return_tensors="np").input_features
for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2):
self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3))
if test_truncation:
audio_inputs_truncated = [x[:MAX_LENGTH_FOR_TESTING] for x in audio_inputs]
np_audio_inputs_truncated = [np.asarray(audio_input) for audio_input in audio_inputs_truncated]
encoded_sequences_1 = feature_extractor(
audio_inputs_truncated, max_length=MAX_LENGTH_FOR_TESTING, return_tensors="np"
).input_features
encoded_sequences_2 = feature_extractor(
np_audio_inputs_truncated, max_length=MAX_LENGTH_FOR_TESTING, return_tensors="np"
).input_features
for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2):
self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3))
def test_audio_features_attn_mask_consistent(self):
# regression test for https://github.com/huggingface/transformers/issues/39911
# Test input_features and input_features_mask have consistent shape
np.random.seed(42)
feature_extractor = self.feature_extraction_class(**self.feat_extract_dict)
for i in [512, 640, 1024]:
audio = np.random.randn(i)
mm_data = {
"raw_speech": [audio],
"sampling_rate": 16000,
}
inputs = feature_extractor(**mm_data, return_tensors="np")
out = inputs["input_features"]
mask = inputs["input_features_mask"]
assert out.ndim == 3
assert mask.ndim == 2
assert out.shape[:2] == mask.shape[:2]
def test_dither(self):
np.random.seed(42) # seed the dithering randn()
# Tests that features with and without little dithering are similar, but not the same
dict_no_dither = self.feat_extract_tester.prepare_feat_extract_dict()
dict_no_dither["dither"] = 0.0
dict_dither = self.feat_extract_tester.prepare_feat_extract_dict()
dict_dither["dither"] = 0.00003 # approx. 1/32k
feature_extractor_no_dither = self.feature_extraction_class(**dict_no_dither)
feature_extractor_dither = self.feature_extraction_class(**dict_dither)
# create three inputs of length 800, 1000, and 1200
speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]
np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs]
# compute features
input_features_no_dither = feature_extractor_no_dither(
np_speech_inputs, padding=True, return_tensors="np", sampling_rate=dict_no_dither["sampling_rate"]
).input_features
input_features_dither = feature_extractor_dither(
np_speech_inputs, padding=True, return_tensors="np", sampling_rate=dict_dither["sampling_rate"]
).input_features
# test there is a difference between features (there's added noise to input signal)
diff = input_features_dither - input_features_no_dither
# features are not identical
assert np.abs(diff).mean() > 1e-6
# features are not too different
# the heuristic value `7e-4` is obtained by running 50000 times (maximal value is around 3e-4).
assert np.abs(diff).mean() < 7e-4
# the heuristic value `8e-1` is obtained by running 50000 times (maximal value is around 5e-1).
assert np.abs(diff).max() < 8e-1
@require_torch
def test_double_precision_pad(self):
import torch
feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
np_speech_inputs = np.random.rand(100, 32).astype(np.float64)
py_speech_inputs = np_speech_inputs.tolist()
for inputs in [py_speech_inputs, np_speech_inputs]:
np_processed = feature_extractor.pad([{"input_features": inputs}], return_tensors="np")
self.assertTrue(np_processed.input_features.dtype == np.float32)
pt_processed = feature_extractor.pad([{"input_features": inputs}], return_tensors="pt")
self.assertTrue(pt_processed.input_features.dtype == torch.float32)