extend gemma3n integration ut cases on XPU (#41071)
Signed-off-by: Yao, Matrix <matrix.yao@intel.com>
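
The central change in this diff is replacing single hard-coded expected outputs with device-keyed ones via the `Expectations` helper from `transformers.testing_utils`, so the same integration test can assert different generations on CUDA and XPU. A minimal sketch of the pattern the hunks below follow (the expected strings here are made up; the real ones appear in the diff):

    from transformers.testing_utils import Expectations

    # Keys are (device_type, version) pairs; None matches any version of that device.
    EXPECTATIONS = Expectations(
        {
            ("cuda", None): "text the model is expected to generate on NVIDIA GPUs",
            ("xpu", None): "text the model is expected to generate on Intel XPUs",
        }
    )

    # get_expectation() resolves the entry matching the device the test runs on,
    # so one test body can carry per-device expected outputs.
    expected = EXPECTATIONS.get_expectation()
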
@@ -37,10 +37,12 @@ from transformers import (
     is_torch_available,
 )
 from transformers.testing_utils import (
+    Expectations,
     cleanup,
+    require_deterministic_for_xpu,
     require_read_token,
     require_torch,
-    require_torch_gpu,
+    require_torch_accelerator,
     set_config_for_less_flaky_test,
     set_model_for_less_flaky_test,
     slow,
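
The import hunk above pairs with the decorator changes further down: `require_torch_gpu` restricts a test to CUDA, while `require_torch_accelerator` lets it run on any supported accelerator (e.g. CUDA or XPU), and `require_deterministic_for_xpu` gates tests that are only stable on XPU under deterministic kernels. A hedged usage sketch with a hypothetical test class (the exact skip condition of `require_deterministic_for_xpu` is assumed, not taken from this diff):

    import unittest

    from transformers.testing_utils import require_deterministic_for_xpu, require_torch_accelerator


    @require_torch_accelerator  # runs on CUDA, XPU, or any other supported accelerator
    class ExampleIntegrationTest(unittest.TestCase):
        @require_deterministic_for_xpu  # assumed: skipped on XPU unless deterministic algorithms are enabled
        def test_generation(self):
            ...
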
@@ -217,8 +219,6 @@ class Gemma3nAudioModelTest(ModelTesterMixin, unittest.TestCase):
         self.assertEqual(input_features.shape, self.expected_input_features_shape)
         np.testing.assert_allclose(input_features[0, 0, :5], self.expected_input_features_slice, rtol=1e-5, atol=1e-5)

-        print(input_features[0, 0, :5])
-
         input_features_mask = audio_inputs["input_features_mask"]
         self.assertEqual(input_features_mask.shape, self.expected_input_features_mask_shape)
         # The second audio sample is shorter (22 frames vs 48), so its mask should become False at index 22
@@ -235,8 +235,6 @@ class Gemma3nAudioModelTest(ModelTesterMixin, unittest.TestCase):
         with torch.no_grad():
             encoder_output, encoder_mask = model(**inputs_dict)

-        print(encoder_output[0, 0, :5])
-
         # Check output encodings
         self.assertEqual(encoder_output.shape, self.expected_encoder_output_shape)
         torch.testing.assert_close(
@@ -745,7 +743,7 @@ class Gemma3nVision2TextModelTest(ModelTesterMixin, GenerationTesterMixin, unitt


 @slow
-@require_torch_gpu
+@require_torch_accelerator
 @require_read_token
 class Gemma3nIntegrationTest(unittest.TestCase):
     def setUp(self):
@@ -766,7 +764,7 @@ class Gemma3nIntegrationTest(unittest.TestCase):
         audio_ds = load_dataset(
             "etechgrid/28.5k_wavfiles_dataset", "default", data_files="wav_dataset/103-1240-0000.wav"
         )
-        self.audio_file_path = audio_ds["train"][0]["audio"].metadata.path
+        self.audio_file_path = audio_ds["train"][0]["audio"]["path"]
         cleanup(torch_device, gc_collect=True)

     def tearDown(self):
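
The `setUp` hunk above switches how the audio file path is read off the `datasets` audio column; which accessor exists depends on the installed `datasets` release. A version-tolerant sketch combining the two accessors that appear in this hunk (the `isinstance` dispatch is an illustration, not part of the commit):

    from datasets import load_dataset

    audio_ds = load_dataset(
        "etechgrid/28.5k_wavfiles_dataset", "default", data_files="wav_dataset/103-1240-0000.wav"
    )
    sample = audio_ds["train"][0]["audio"]
    # Some datasets versions return a plain dict with a "path" key, others a
    # decoder object exposing .metadata.path (assumed mapping between the two).
    audio_file_path = sample["path"] if isinstance(sample, dict) else sample.metadata.path
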
@@ -869,7 +867,17 @@ class Gemma3nIntegrationTest(unittest.TestCase):
         output = model.generate(**inputs, max_new_tokens=30, do_sample=False)
         output_text = self.processor.batch_decode(output, skip_special_tokens=True)

-        EXPECTED_TEXTS = ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown and white cow standing on a sandy beach next to a clear blue ocean. The cow is facing the viewer with its head slightly', "user\nYou are a helpful assistant.\n\n\n\n\n\n\n\n\n\nAre these images identical?\nmodel\nNo, the images are not identical. \n\nHere's a breakdown of the differences:\n\n* **Subject:** The first image features a cow"] # fmt: skip
+        # fmt: off
+        EXPECTATIONS = Expectations(
+            {
+                ("cuda", None): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown and white cow standing on a sandy beach next to a clear blue ocean. The cow is facing the viewer with its head slightly', "user\nYou are a helpful assistant.\n\n\n\n\n\n\n\n\n\nAre these images identical?\nmodel\nNo, the images are not identical. \n\nHere's a breakdown of the differences:\n\n* **Subject:** The first image features a cow"],
+                ("xpu", None): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown and white cow standing on a sandy beach next to a turquoise ocean. The cow is facing the viewer with its head slightly turned', "user\nYou are a helpful assistant.\n\n\n\n\n\n\n\n\n\nAre these images identical?\nmodel\nNo, the images are not identical. \n\nHere's a breakdown of the differences:\n\n* **Subject:** The first image features a cow"],
+            }
+        )
+
+        EXPECTED_TEXTS = EXPECTATIONS.get_expectation()
+        # fmt: on
+
         self.assertEqual(output_text, EXPECTED_TEXTS)

     def test_model_4b_image(self):
@@ -891,10 +899,22 @@ class Gemma3nIntegrationTest(unittest.TestCase):
         output_text = self.processor.batch_decode(output, skip_special_tokens=True)

         EXPECTED_NUM_IMAGES = 1  # Gemma3n does not support crops
-        EXPECTED_TEXTS = ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown and white cow standing on a sandy beach next to a clear blue ocean. The cow is facing the viewer with its head slightly'] # fmt: skip
+
+        # fmt: off
+        EXPECTATIONS = Expectations(
+            {
+                ("cuda", None): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown and white cow standing on a sandy beach next to a clear blue ocean. The cow is facing the viewer with its head slightly'],
+                ("xpu", None): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown and white cow standing on a sandy beach next to a clear blue ocean. The cow is facing the viewer with its head slightly'],
+            }
+        )
+
+        EXPECTED_TEXTS = EXPECTATIONS.get_expectation()
+        # fmt: on
+
         self.assertEqual(len(inputs["pixel_values"]), EXPECTED_NUM_IMAGES)
         self.assertEqual(output_text, EXPECTED_TEXTS)

+    @require_deterministic_for_xpu
     def test_model_4b_multiimage(self):
         model_id = "Google/gemma-3n-E4B-it"

@@ -928,7 +948,17 @@ class Gemma3nIntegrationTest(unittest.TestCase):
         output = model.generate(**inputs, max_new_tokens=30, do_sample=False)
         output_text = self.processor.batch_decode(output, skip_special_tokens=True)

-        EXPECTED_TEXTS = ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat do you see here?\nmodel\nIn the image, I see a street scene in what appears to be a Chinatown district. Here are some key elements:\n\n* **A prominent red'] # fmt: skip
+        # fmt: off
+        EXPECTATIONS = Expectations(
+            {
+                ("cuda", None): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat do you see here?\nmodel\nIn the image, I see a street scene in what appears to be a Chinatown district. Here are some key elements:\n\n* **A prominent red'],
+                ("xpu", None): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat do you see here?\nmodel\nIn the image, I see a street scene in what appears to be a Chinatown district. Here are the key elements:\n\n* **A prominent red'],
+            }
+        )
+
+        EXPECTED_TEXTS = EXPECTATIONS.get_expectation()
+        # fmt: on
+
         self.assertEqual(output_text, EXPECTED_TEXTS)

     @unittest.skip("For now, using a gemma model with the 3n class is not supported")
@@ -978,6 +1008,7 @@ class Gemma3nIntegrationTest(unittest.TestCase):
         EXPECTED_COMPLETIONS = [" and I think it's a nice place to visit. This is a nice place. This is", ", green, yellow, orange, purple, pink, brown, black, white.\n\nHere'"] # fmt: skip
         self.assertEqual(output_text, EXPECTED_COMPLETIONS)

+    @require_deterministic_for_xpu
     def test_generation_beyond_sliding_window_with_generation_config(self):
         """Same as `test_generation_beyond_sliding_window`, but passing a GenerationConfig. Regression test for #36684 --
         ensures `cache_implementation='hybrid'` is correctly inherited from the base `model.generation_config`.
@@ -1003,5 +1034,15 @@ class Gemma3nIntegrationTest(unittest.TestCase):
         ]
         output_text = tokenizer.batch_decode(out)

-        EXPECTED_COMPLETIONS = [" and I am glad to be here. This is a nice place. This is a nice place.", ", green, yellow, purple, orange, pink, brown, black, white.\n\nHere are"] # fmt: skip
+        # fmt: off
+        EXPECTATIONS = Expectations(
+            {
+                ("cuda", None): [" and I am glad to be here. This is a nice place. This is a nice place.", ", green, yellow, purple, orange, pink, brown, black, white.\n\nHere are"],
+                ("xpu", None): [" and I think it is very nice. I think it is nice. This is a nice place.", ", green, yellow, purple, orange, pink, brown, black, white.\n\nHere are"],
+            }
+        )
+
+        EXPECTED_COMPLETIONS = EXPECTATIONS.get_expectation()
+        # fmt: on
+
         self.assertEqual(output_text, EXPECTED_COMPLETIONS)