fix

2025-11-12 01:04:36 +08:00 · 2023-12-07 16:27:45 +01:00
1 changed file with 65 additions and 62 deletions
--- a/tests/generation/test_utils.py
+++ b/tests/generation/test_utils.py
@@ -1831,78 +1831,81 @@ class GenerationTesterMixin:
    def test_generate_continue_from_past_key_values(self):
        # Tests that we can continue generating from past key values, returned from a previous `generate` call
        for model_class in self.all_generative_model_classes:
-            if any(model_name in model_class.__name__.lower() for model_name in ["imagegpt"]):
-                self.skipTest("Won't fix: old model with unique inputs/caches/other")
-            if any(model_name in model_class.__name__.lower() for model_name in ["umt5"]):
-                self.skipTest("TODO: needs modeling or test input preparation fixes for compatibility")

-            config, inputs = self.model_tester.prepare_config_and_inputs_for_common()
+            with self.subTest(f"test_generate_continue_from_past_key_values: model_class = {model_class.__name__}"):

-            if not hasattr(config, "use_cache"):
-                self.skipTest("This model doesn't support caching")
+                if any(model_name in model_class.__name__.lower() for model_name in ["imagegpt"]):
+                    self.skipTest("Won't fix: old model with unique inputs/caches/other")
+                if any(model_name in model_class.__name__.lower() for model_name in ["umt5"]):
+                    self.skipTest("TODO: needs modeling or test input preparation fixes for compatibility")

-            # Let's make it always:
-            # 1. use cache (for obvious reasons)
-            # 2. generate to max length (which can be achieved by setting the eos token to an invalid value), which
-            #    would make the test flaky (e.g. EOS is generated on iteration 1 on both generations, but the
-            #    continuation would force it to generate beyond an EOS token)
-            # 3. ignore `token_type_ids` for simplicity
-            # 4. ignore `forced_eos_token_id`, which requires further manipulation of the continuation inputs and is
-            #    active by default on some models
-            config.use_cache = True
-            if "token_type_ids" in inputs:
-                del inputs["token_type_ids"]
+                config, inputs = self.model_tester.prepare_config_and_inputs_for_common()

-            model = model_class(config).to(torch_device)
-            model.eval()
-            model.generation_config.pad_token_id = model.generation_config.eos_token_id = -1
-            model.generation_config.forced_eos_token_id = None
+                if not hasattr(config, "use_cache"):
+                    self.skipTest("This model doesn't support caching")

-            # If "past_key_values" is not returned, skip the test (e.g. RWKV uses a different cache name and format)
-            outputs = model(**inputs)
-            if "past_key_values" not in outputs:
-                self.skipTest("This model doesn't return `past_key_values`")
+                # Let's make it always:
+                # 1. use cache (for obvious reasons)
+                # 2. generate to max length (which can be achieved by setting the eos token to an invalid value), which
+                #    would make the test flaky (e.g. EOS is generated on iteration 1 on both generations, but the
+                #    continuation would force it to generate beyond an EOS token)
+                # 3. ignore `token_type_ids` for simplicity
+                # 4. ignore `forced_eos_token_id`, which requires further manipulation of the continuation inputs and is
+                #    active by default on some models
+                config.use_cache = True
+                if "token_type_ids" in inputs:
+                    del inputs["token_type_ids"]

-            # Traditional way of generating text, with `return_dict_in_generate` to return the past key values
-            outputs = model.generate(**inputs, do_sample=False, max_new_tokens=4, return_dict_in_generate=True)
+                model = model_class(config).to(torch_device)
+                model.eval()
+                model.generation_config.pad_token_id = model.generation_config.eos_token_id = -1
+                model.generation_config.forced_eos_token_id = None

-            # Let's generate again, but passing the past key values in between (3 + 1 = 4 tokens). Note that the
-            # inputs may need to be tweaked across `generate` calls (like the attention mask).
-            outputs_cached = model.generate(**inputs, do_sample=False, max_new_tokens=3, return_dict_in_generate=True)
+                # If "past_key_values" is not returned, skip the test (e.g. RWKV uses a different cache name and format)
+                outputs = model(**inputs)
+                if "past_key_values" not in outputs:
+                    self.skipTest("This model doesn't return `past_key_values`")

-            # Continue from the tokens generated above, preparing the inputs accordingly
-            inputs["past_key_values"] = outputs_cached.past_key_values
-            new_attention_len = outputs_cached.sequences.shape[-1]
-            if config.is_encoder_decoder:
-                inputs["decoder_input_ids"] = outputs_cached.sequences
-                if "decoder_attention_mask" in inputs:
-                    inputs["decoder_attention_mask"] = torch.nn.functional.pad(
-                        inputs["decoder_attention_mask"],
-                        (0, new_attention_len - inputs["decoder_attention_mask"].shape[1]),
-                        mode="constant",
-                        value=1,
-                    )
-            else:
-                inputs["input_ids"] = outputs_cached.sequences
-                if "attention_mask" in inputs:
-                    inputs["attention_mask"] = torch.nn.functional.pad(
-                        inputs["attention_mask"],
-                        (0, new_attention_len - inputs["attention_mask"].shape[1]),
-                        mode="constant",
-                        value=1,
-                    )
-            outputs_cached = model.generate(**inputs, do_sample=False, max_new_tokens=1, return_dict_in_generate=True)
+                # Traditional way of generating text, with `return_dict_in_generate` to return the past key values
+                outputs = model.generate(**inputs, do_sample=False, max_new_tokens=4, return_dict_in_generate=True)

-            # The two sets of generated text and past kv should be equal to each other
-            self.assertListEqual(outputs.sequences.tolist(), outputs_cached.sequences.tolist())
-            for layer_idx in range(len(outputs_cached.past_key_values)):
-                for kv_idx in range(len(outputs_cached.past_key_values[layer_idx])):
-                    self.assertTrue(
-                        torch.allclose(
-                            outputs.past_key_values[layer_idx][kv_idx],
-                            outputs_cached.past_key_values[layer_idx][kv_idx],
+                # Let's generate again, but passing the past key values in between (3 + 1 = 4 tokens). Note that the
+                # inputs may need to be tweaked across `generate` calls (like the attention mask).
+                outputs_cached = model.generate(**inputs, do_sample=False, max_new_tokens=3, return_dict_in_generate=True)
+
+                # Continue from the tokens generated above, preparing the inputs accordingly
+                inputs["past_key_values"] = outputs_cached.past_key_values
+                new_attention_len = outputs_cached.sequences.shape[-1]
+                if config.is_encoder_decoder:
+                    inputs["decoder_input_ids"] = outputs_cached.sequences
+                    if "decoder_attention_mask" in inputs:
+                        inputs["decoder_attention_mask"] = torch.nn.functional.pad(
+                            inputs["decoder_attention_mask"],
+                            (0, new_attention_len - inputs["decoder_attention_mask"].shape[1]),
+                            mode="constant",
+                            value=1,
+                        )
+                else:
+                    inputs["input_ids"] = outputs_cached.sequences
+                    if "attention_mask" in inputs:
+                        inputs["attention_mask"] = torch.nn.functional.pad(
+                            inputs["attention_mask"],
+                            (0, new_attention_len - inputs["attention_mask"].shape[1]),
+                            mode="constant",
+                            value=1,
+                        )
+                outputs_cached = model.generate(**inputs, do_sample=False, max_new_tokens=1, return_dict_in_generate=True)
+
+                # The two sets of generated text and past kv should be equal to each other
+                self.assertListEqual(outputs.sequences.tolist(), outputs_cached.sequences.tolist())
+                for layer_idx in range(len(outputs_cached.past_key_values)):
+                    for kv_idx in range(len(outputs_cached.past_key_values[layer_idx])):
+                        self.assertTrue(
+                            torch.allclose(
+                                outputs.past_key_values[layer_idx][kv_idx],
+                                outputs_cached.past_key_values[layer_idx][kv_idx],
+                            )
                        )
-                    )

    def _check_outputs(self, output, input_ids, config, use_cache=False, num_return_sequences=1):
        batch_size, seq_length = input_ids.shape