extend gemma3n integration ut cases on XPU (#41071)
Signed-off-by: Yao, Matrix <matrix.yao@intel.com>
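
The central change in this diff is replacing single hard-coded expected outputs with device-keyed ones via the `Expectations` helper from `transformers.testing_utils`, so the same integration test can assert different generations on CUDA and XPU. A minimal sketch of the pattern the hunks below follow (the expected strings here are made up; the real ones appear in the diff):

    from transformers.testing_utils import Expectations

    # Keys are (device_type, version) pairs; None matches any version of that device.
    EXPECTATIONS = Expectations(
        {
            ("cuda", None): "text the model is expected to generate on NVIDIA GPUs",
            ("xpu", None): "text the model is expected to generate on Intel XPUs",
        }
    )

    # get_expectation() resolves the entry matching the device the test runs on,
    # so one test body can carry per-device expected outputs.
    expected = EXPECTATIONS.get_expectation()
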
@@ -37,10 +37,12 @@ from transformers import (
     is_torch_available,
 )
 from transformers.testing_utils import (
+    Expectations,
     cleanup,
+    require_deterministic_for_xpu,
     require_read_token,
     require_torch,
-    require_torch_gpu,
+    require_torch_accelerator,
     set_config_for_less_flaky_test,
     set_model_for_less_flaky_test,
     slow,
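
The import hunk above pairs with the decorator changes further down: `require_torch_gpu` restricts a test to CUDA, while `require_torch_accelerator` lets it run on any supported accelerator (e.g. CUDA or XPU), and `require_deterministic_for_xpu` gates tests that are only stable on XPU under deterministic kernels. A hedged usage sketch with a hypothetical test class (the exact skip condition of `require_deterministic_for_xpu` is assumed, not taken from this diff):

    import unittest

    from transformers.testing_utils import require_deterministic_for_xpu, require_torch_accelerator


    @require_torch_accelerator  # runs on CUDA, XPU, or any other supported accelerator
    class ExampleIntegrationTest(unittest.TestCase):
        @require_deterministic_for_xpu  # assumed: skipped on XPU unless deterministic algorithms are enabled
        def test_generation(self):
            ...
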
@@ -217,8 +219,6 @@ class Gemma3nAudioModelTest(ModelTesterMixin, unittest.TestCase):
         self.assertEqual(input_features.shape, self.expected_input_features_shape)
         np.testing.assert_allclose(input_features[0, 0, :5], self.expected_input_features_slice, rtol=1e-5, atol=1e-5)

-        print(input_features[0, 0, :5])
-
         input_features_mask = audio_inputs["input_features_mask"]
         self.assertEqual(input_features_mask.shape, self.expected_input_features_mask_shape)
         # The second audio sample is shorter (22 frames vs 48), so its mask should become False at index 22
@@ -235,8 +235,6 @@ class Gemma3nAudioModelTest(ModelTesterMixin, unittest.TestCase):
         with torch.no_grad():
             encoder_output, encoder_mask = model(**inputs_dict)

-        print(encoder_output[0, 0, :5])
-
         # Check output encodings
         self.assertEqual(encoder_output.shape, self.expected_encoder_output_shape)
         torch.testing.assert_close(
@@ -745,7 +743,7 @@ class Gemma3nVision2TextModelTest(ModelTesterMixin, GenerationTesterMixin, unitt


 @slow
-@require_torch_gpu
+@require_torch_accelerator
 @require_read_token
 class Gemma3nIntegrationTest(unittest.TestCase):
     def setUp(self):
@@ -766,7 +764,7 @@ class Gemma3nIntegrationTest(unittest.TestCase):
         audio_ds = load_dataset(
             "etechgrid/28.5k_wavfiles_dataset", "default", data_files="wav_dataset/103-1240-0000.wav"
         )
-        self.audio_file_path = audio_ds["train"][0]["audio"].metadata.path
+        self.audio_file_path = audio_ds["train"][0]["audio"]["path"]
         cleanup(torch_device, gc_collect=True)

     def tearDown(self):
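
The `setUp` hunk above switches how the audio file path is read off the `datasets` audio column; which accessor exists depends on the installed `datasets` release. A version-tolerant sketch combining the two accessors that appear in this hunk (the `isinstance` dispatch is an illustration, not part of the commit):

    from datasets import load_dataset

    audio_ds = load_dataset(
        "etechgrid/28.5k_wavfiles_dataset", "default", data_files="wav_dataset/103-1240-0000.wav"
    )
    sample = audio_ds["train"][0]["audio"]
    # Some datasets versions return a plain dict with a "path" key, others a
    # decoder object exposing .metadata.path (assumed mapping between the two).
    audio_file_path = sample["path"] if isinstance(sample, dict) else sample.metadata.path
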
@@ -869,7 +867,17 @@ class Gemma3nIntegrationTest(unittest.TestCase):
         output = model.generate(**inputs, max_new_tokens=30, do_sample=False)
         output_text = self.processor.batch_decode(output, skip_special_tokens=True)

-        EXPECTED_TEXTS = ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown and white cow standing on a sandy beach next to a clear blue ocean. The cow is facing the viewer with its head slightly', "user\nYou are a helpful assistant.\n\n\n\n\n\n\n\n\n\nAre these images identical?\nmodel\nNo, the images are not identical. \n\nHere's a breakdown of the differences:\n\n* **Subject:** The first image features a cow"] # fmt: skip
+        # fmt: off
+        EXPECTATIONS = Expectations(
+            {
+                ("cuda", None): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown and white cow standing on a sandy beach next to a clear blue ocean. The cow is facing the viewer with its head slightly', "user\nYou are a helpful assistant.\n\n\n\n\n\n\n\n\n\nAre these images identical?\nmodel\nNo, the images are not identical. \n\nHere's a breakdown of the differences:\n\n* **Subject:** The first image features a cow"],
+                ("xpu", None): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown and white cow standing on a sandy beach next to a turquoise ocean. The cow is facing the viewer with its head slightly turned', "user\nYou are a helpful assistant.\n\n\n\n\n\n\n\n\n\nAre these images identical?\nmodel\nNo, the images are not identical. \n\nHere's a breakdown of the differences:\n\n* **Subject:** The first image features a cow"],
+            }
+        )
+
+        EXPECTED_TEXTS = EXPECTATIONS.get_expectation()
+        # fmt: on
+
         self.assertEqual(output_text, EXPECTED_TEXTS)

     def test_model_4b_image(self):
@@ -891,10 +899,22 @@ class Gemma3nIntegrationTest(unittest.TestCase):
         output_text = self.processor.batch_decode(output, skip_special_tokens=True)

         EXPECTED_NUM_IMAGES = 1  # Gemma3n does not support crops
-        EXPECTED_TEXTS = ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown and white cow standing on a sandy beach next to a clear blue ocean. The cow is facing the viewer with its head slightly'] # fmt: skip
+
+        # fmt: off
+        EXPECTATIONS = Expectations(
+            {
+                ("cuda", None): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown and white cow standing on a sandy beach next to a clear blue ocean. The cow is facing the viewer with its head slightly'],
+                ("xpu", None): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown and white cow standing on a sandy beach next to a clear blue ocean. The cow is facing the viewer with its head slightly'],
+            }
+        )
+
+        EXPECTED_TEXTS = EXPECTATIONS.get_expectation()
+        # fmt: on
+
         self.assertEqual(len(inputs["pixel_values"]), EXPECTED_NUM_IMAGES)
         self.assertEqual(output_text, EXPECTED_TEXTS)

+    @require_deterministic_for_xpu
     def test_model_4b_multiimage(self):
         model_id = "Google/gemma-3n-E4B-it"

@@ -928,7 +948,17 @@ class Gemma3nIntegrationTest(unittest.TestCase):
         output = model.generate(**inputs, max_new_tokens=30, do_sample=False)
         output_text = self.processor.batch_decode(output, skip_special_tokens=True)

-        EXPECTED_TEXTS = ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat do you see here?\nmodel\nIn the image, I see a street scene in what appears to be a Chinatown district. Here are some key elements:\n\n* **A prominent red'] # fmt: skip
+        # fmt: off
+        EXPECTATIONS = Expectations(
+            {
+                ("cuda", None): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat do you see here?\nmodel\nIn the image, I see a street scene in what appears to be a Chinatown district. Here are some key elements:\n\n* **A prominent red'],
+                ("xpu", None): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat do you see here?\nmodel\nIn the image, I see a street scene in what appears to be a Chinatown district. Here are the key elements:\n\n* **A prominent red'],
+            }
+        )
+
+        EXPECTED_TEXTS = EXPECTATIONS.get_expectation()
+        # fmt: on
+
         self.assertEqual(output_text, EXPECTED_TEXTS)

     @unittest.skip("For now, using a gemma model with the 3n class is not supported")
@@ -978,6 +1008,7 @@ class Gemma3nIntegrationTest(unittest.TestCase):
         EXPECTED_COMPLETIONS = [" and I think it's a nice place to visit. This is a nice place. This is", ", green, yellow, orange, purple, pink, brown, black, white.\n\nHere'"] # fmt: skip
         self.assertEqual(output_text, EXPECTED_COMPLETIONS)

+    @require_deterministic_for_xpu
     def test_generation_beyond_sliding_window_with_generation_config(self):
         """Same as `test_generation_beyond_sliding_window`, but passing a GenerationConfig. Regression test for #36684 --
         ensures `cache_implementation='hybrid'` is correctly inherited from the base `model.generation_config`.
@@ -1003,5 +1034,15 @@ class Gemma3nIntegrationTest(unittest.TestCase):
         ]
         output_text = tokenizer.batch_decode(out)

-        EXPECTED_COMPLETIONS = [" and I am glad to be here. This is a nice place. This is a nice place.", ", green, yellow, purple, orange, pink, brown, black, white.\n\nHere are"] # fmt: skip
+        # fmt: off
+        EXPECTATIONS = Expectations(
+            {
+                ("cuda", None): [" and I am glad to be here. This is a nice place. This is a nice place.", ", green, yellow, purple, orange, pink, brown, black, white.\n\nHere are"],
+                ("xpu", None): [" and I think it is very nice. I think it is nice. This is a nice place.", ", green, yellow, purple, orange, pink, brown, black, white.\n\nHere are"],
+            }
+        )
+
+        EXPECTED_COMPLETIONS = EXPECTATIONS.get_expectation()
+        # fmt: on
+
         self.assertEqual(output_text, EXPECTED_COMPLETIONS)