Fix slow static cache export tests (#40261)
@@ -325,14 +325,14 @@ class TorchExportableModuleForDecoderOnlyLM(torch.nn.Module):
                 "input_ids": input_ids,
                 "cache_position": cache_position
                 if cache_position is not None
-                else torch.arange(input_ids.shape[-1], dtype=torch.long, model=model_device),
+                else torch.arange(input_ids.shape[-1], dtype=torch.long, device=model_device),
             }
         else:  # inputs_embeds
             input_kwargs = {
                 "inputs_embeds": inputs_embeds,
                 "cache_position": cache_position
                 if cache_position is not None
-                else torch.arange(inputs_embeds.shape[1], dtype=torch.long, model=model_device),
+                else torch.arange(inputs_embeds.shape[1], dtype=torch.long, device=model_device),
             }

         exported_program = torch.export.export(
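The hunk above fixes a keyword typo: torch.arange takes a device argument, not model, so the old code raised a TypeError whenever cache_position was left as None. A minimal Python sketch of the corrected default, assuming input_ids is a [batch, seq_len] LongTensor and model_device is the model's device (both placeholders here):

import torch

# Placeholder inputs standing in for the module's real arguments.
input_ids = torch.tensor([[5, 6, 7]], dtype=torch.long)
model_device = input_ids.device

# Default cache_position when the caller does not pass one:
# positions 0..seq_len-1, created on the model's device.
cache_position = torch.arange(input_ids.shape[-1], dtype=torch.long, device=model_device)
print(cache_position)  # tensor([0, 1, 2])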
@@ -463,8 +463,8 @@ class GemmaIntegrationTest(unittest.TestCase):

         exportable_module = TorchExportableModuleForDecoderOnlyLM(model)
         exported_program = exportable_module.export(
-            input_ids=prompt_token_ids,
-            cache_position=torch.arange(prompt_token_ids.shape[-1], dtype=torch.long, device=model.device),
+            input_ids=torch.tensor([[1]], dtype=torch.long, device=model.device),
+            cache_position=torch.tensor([0], dtype=torch.long, device=model.device),
         )
         ep_generated_ids = TorchExportableModuleWithStaticCache.generate(
             exported_program=exported_program, prompt_token_ids=prompt_token_ids, max_new_tokens=max_new_tokens
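This and the remaining test hunks make the same change: the program is exported from a single example token at cache position 0 (the shape of one decode step against the static cache), while the real prompt is still passed to TorchExportableModuleWithStaticCache.generate. A hedged sketch of that export-then-generate pattern; the model id, cache sizes, prompt, and max_new_tokens below are illustrative placeholders, not values from the tests:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
from transformers.integrations.executorch import (
    TorchExportableModuleForDecoderOnlyLM,
    TorchExportableModuleWithStaticCache,
)

model_id = "google/gemma-2b"  # placeholder checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    generation_config=GenerationConfig(
        use_cache=True,
        cache_implementation="static",
        cache_config={"batch_size": 1, "max_cache_len": 64},
    ),
)
prompt_token_ids = tokenizer("Hello", return_tensors="pt").input_ids

exportable_module = TorchExportableModuleForDecoderOnlyLM(model)
exported_program = exportable_module.export(
    # Trace one decode step: a single token at cache position 0.
    input_ids=torch.tensor([[1]], dtype=torch.long, device=model.device),
    cache_position=torch.tensor([0], dtype=torch.long, device=model.device),
)
ep_generated_ids = TorchExportableModuleWithStaticCache.generate(
    exported_program=exported_program, prompt_token_ids=prompt_token_ids, max_new_tokens=20
)
print(tokenizer.batch_decode(ep_generated_ids, skip_special_tokens=True))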
@@ -368,8 +368,8 @@ class Gemma2IntegrationTest(unittest.TestCase):

         exportable_module = TorchExportableModuleForDecoderOnlyLM(model)
         exported_program = exportable_module.export(
-            input_ids=prompt_token_ids,
-            cache_position=torch.arange(prompt_token_ids.shape[-1], dtype=torch.long, device=model.device),
+            input_ids=torch.tensor([[1]], dtype=torch.long, device=model.device),
+            cache_position=torch.tensor([0], dtype=torch.long, device=model.device),
         )
         ep_generated_ids = TorchExportableModuleWithStaticCache.generate(
             exported_program=exported_program, prompt_token_ids=prompt_token_ids, max_new_tokens=max_new_tokens
@@ -354,8 +354,8 @@ class LlamaIntegrationTest(unittest.TestCase):

         exportable_module = TorchExportableModuleForDecoderOnlyLM(model)
         exported_program = exportable_module.export(
-            input_ids=prompt_token_ids,
-            cache_position=torch.arange(prompt_token_ids.shape[-1], dtype=torch.long, device=model.device),
+            input_ids=torch.tensor([[1]], dtype=torch.long, device=model.device),
+            cache_position=torch.tensor([0], dtype=torch.long, device=model.device),
         )
         ep_generated_ids = TorchExportableModuleWithStaticCache.generate(
             exported_program=exported_program, prompt_token_ids=prompt_token_ids, max_new_tokens=max_new_tokens
@@ -387,8 +387,8 @@ class OlmoIntegrationTest(unittest.TestCase):

         exportable_module = TorchExportableModuleForDecoderOnlyLM(model)
         exported_program = exportable_module.export(
-            input_ids=prompt_token_ids,
-            cache_position=torch.arange(prompt_token_ids.shape[-1], dtype=torch.long, device=model.device),
+            input_ids=torch.tensor([[1]], dtype=torch.long, device=model.device),
+            cache_position=torch.tensor([0], dtype=torch.long, device=model.device),
         )
         ep_generated_ids = TorchExportableModuleWithStaticCache.generate(
             exported_program=exported_program, prompt_token_ids=prompt_token_ids, max_new_tokens=max_new_tokens
@@ -415,8 +415,8 @@ class Phi3IntegrationTest(unittest.TestCase):

         exportable_module = TorchExportableModuleForDecoderOnlyLM(model)
         exported_program = exportable_module.export(
-            input_ids=prompt_token_ids,
-            cache_position=torch.arange(prompt_token_ids.shape[-1], dtype=torch.long, device=model.device),
+            input_ids=torch.tensor([[1]], dtype=torch.long, device=model.device),
+            cache_position=torch.tensor([0], dtype=torch.long, device=model.device),
         )
         ep_generated_ids = TorchExportableModuleWithStaticCache.generate(
             exported_program=exported_program, prompt_token_ids=prompt_token_ids, max_new_tokens=max_new_tokens
@@ -305,8 +305,8 @@ class Qwen2IntegrationTest(unittest.TestCase):
             "2.7.0"
         )  # Due to https://github.com/pytorch/pytorch/issues/150994
         exported_program = exportable_module.export(
-            input_ids=prompt_token_ids,
-            cache_position=torch.arange(prompt_token_ids.shape[-1], dtype=torch.long, device=model.device),
+            input_ids=torch.tensor([[1]], dtype=torch.long, device=model.device),
+            cache_position=torch.tensor([0], dtype=torch.long, device=model.device),
             strict=strict,
         )
         ep_generated_ids = TorchExportableModuleWithStaticCache.generate(
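The Qwen2 and Qwen3 tests additionally forward a strict flag to export, derived in the surrounding test code from the installed torch version (the "2.7.0" comparison in the context above, due to pytorch issue 150994). A sketch of that call shape, with strict hard-coded to an illustrative value:

# In the real tests, strict depends on the torch version; False here is only illustrative.
strict = False
exported_program = exportable_module.export(
    input_ids=torch.tensor([[1]], dtype=torch.long, device=model.device),
    cache_position=torch.tensor([0], dtype=torch.long, device=model.device),
    strict=strict,
)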
@@ -295,8 +295,8 @@ class Qwen3IntegrationTest(unittest.TestCase):

         exportable_module = TorchExportableModuleForDecoderOnlyLM(model)
         exported_program = exportable_module.export(
-            input_ids=prompt_token_ids,
-            cache_position=torch.arange(prompt_token_ids.shape[-1], dtype=torch.long, device=model.device),
+            input_ids=torch.tensor([[1]], dtype=torch.long, device=model.device),
+            cache_position=torch.tensor([0], dtype=torch.long, device=model.device),
             strict=strict,
         )
         ep_generated_ids = TorchExportableModuleWithStaticCache.generate(