Enable FA in AMD docker (#41069)

* Add FA to docker

* Use caching mechanism for qwen2_5

* Fix a typo in important models list

* Partial fixes for gemma3

* Added a commit ID for FA repo

* Detailed the expectation storage format

* Rebase fix

* Apply style fixes

---------

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
Rémi Ouazan authored on 2025-09-26 13:57:58 +02:00; committed by GitHub
parent 10f6891fc5
commit 50d2448a1a
5 changed files with 41 additions and 51 deletions

View File

@@ -35,3 +35,10 @@ RUN python3 -m pip uninstall -y kernels
# On ROCm, torchcodec is required to decode audio files and 0.4 or 0.6 fails
RUN python3 -m pip install --no-cache-dir "torchcodec==0.5"
# Install flash attention from source. Tested with commit 6387433156558135a998d5568a9d74c1778666d8
RUN git clone https://github.com/ROCm/flash-attention/ -b tridao && \
cd flash-attention && \
GPU_ARCHS="gfx942" python setup.py install
RUN python3 -m pip install --no-cache-dir einops
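
As a quick sanity check after building the image (a minimal sketch, not part of this diff; run inside the container), something like the following confirms the ROCm flash-attention build is importable and detected by transformers:

# Sketch: verify the flash-attention source build inside the built image.
import flash_attn
from transformers.utils import is_flash_attn_2_available

print(flash_attn.__version__)
print(is_flash_attn_2_available())  # should print True if the source build above succeeded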

View File

@@ -3244,7 +3244,9 @@ def unpack_device_properties(
class Expectations(UserDict[PackedDeviceProperties, Any]):
def get_expectation(self) -> Any:
"""
Find best matching expectation based on environment device properties.
Find the best matching expectation based on the environment's device properties. We look at the device_type
and the major and minor versions of the drivers. Expectations are stored as a dictionary with keys of the form
(device_type, (major, minor)); if the major and minor versions are not provided, None is used instead.
"""
return self.find_expectation(get_device_properties())
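
For illustration, a minimal sketch of the storage format described in the docstring above, using the same tuple keys that appear in the tests touched by this commit (the values here are placeholders, not real expectations):

from transformers.testing_utils import Expectations

# Keys are (device_type, (major, minor)); None matches any driver version for that device type.
EXPECTED_TEXTS = Expectations({
    ("cuda", None): "text expected on any CUDA device",
    ("rocm", (9, 4)): "text expected on ROCm with driver version 9.4",
    ("xpu", None): "text expected on any XPU device",
})
# get_expectation() resolves the closest match for the current environment's device properties.
expected = EXPECTED_TEXTS.get_expectation()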

View File

@@ -785,7 +785,10 @@ class Gemma3nIntegrationTest(unittest.TestCase):
output = model.generate(**inputs, max_new_tokens=30, do_sample=False)
output_text = self.processor.batch_decode(output, skip_special_tokens=True)
EXPECTED_TEXTS = ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown and white cow standing on a sandy beach next to a clear blue ocean. The cow is facing the viewer with its head slightly'] # fmt: skip
EXPECTED_TEXTS = Expectations({
("cuda", None): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown and white cow standing on a sandy beach next to a clear blue ocean. The cow is facing the viewer with its head slightly'],
("rocm", (9, 4)): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown and white cow standing on a sandy beach next to a turquoise ocean. The sky is blue with a few white clouds. The'],
}).get_expectation() # fmt: skip
self.assertEqual(output_text, EXPECTED_TEXTS)
def test_model_with_audio(self):
@@ -866,18 +869,11 @@ class Gemma3nIntegrationTest(unittest.TestCase):
output = model.generate(**inputs, max_new_tokens=30, do_sample=False)
output_text = self.processor.batch_decode(output, skip_special_tokens=True)
# fmt: off
EXPECTATIONS = Expectations(
{
("cuda", None): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown and white cow standing on a sandy beach next to a clear blue ocean. The cow is facing the viewer with its head slightly', "user\nYou are a helpful assistant.\n\n\n\n\n\n\n\n\n\nAre these images identical?\nmodel\nNo, the images are not identical. \n\nHere's a breakdown of the differences:\n\n* **Subject:** The first image features a cow"],
("xpu", None): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown and white cow standing on a sandy beach next to a turquoise ocean. The cow is facing the viewer with its head slightly turned', "user\nYou are a helpful assistant.\n\n\n\n\n\n\n\n\n\nAre these images identical?\nmodel\nNo, the images are not identical. \n\nHere's a breakdown of the differences:\n\n* **Subject:** The first image features a cow"],
}
)
EXPECTED_TEXTS = EXPECTATIONS.get_expectation()
# fmt: on
EXPECTED_TEXTS = Expectations({
("cuda", None): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown and white cow standing on a sandy beach next to a clear blue ocean. The cow is facing the viewer with its head slightly', "user\nYou are a helpful assistant.\n\n\n\n\n\n\n\n\n\nAre these images identical?\nmodel\nNo, the images are not identical. \n\nHere's a breakdown of the differences:\n\n* **Subject:** The first image features a cow"],
("rocm", (9, 4)): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown and white cow standing on a sandy beach next to a clear blue ocean. The cow is facing the viewer with its head slightly', "user\nYou are a helpful assistant.\n\n\n\n\n\n\n\n\n\nAre these images identical?\nmodel\nNo, the images are not identical. \n\nHere's a breakdown of the differences:\n\n* **Subject Matter:** The first image shows a"],
("xpu", None): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown and white cow standing on a sandy beach next to a turquoise ocean. The cow is facing the viewer with its head slightly turned', "user\nYou are a helpful assistant.\n\n\n\n\n\n\n\n\n\nAre these images identical?\nmodel\nNo, the images are not identical. \n\nHere's a breakdown of the differences:\n\n* **Subject:** The first image features a cow"],
}).get_expectation() # fmt: skip
self.assertEqual(output_text, EXPECTED_TEXTS)
def test_model_4b_image(self):
@@ -899,18 +895,11 @@ class Gemma3nIntegrationTest(unittest.TestCase):
output_text = self.processor.batch_decode(output, skip_special_tokens=True)
EXPECTED_NUM_IMAGES = 1 # Gemma3n does not support crops
# fmt: off
EXPECTATIONS = Expectations(
{
("cuda", None): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown and white cow standing on a sandy beach next to a clear blue ocean. The cow is facing the viewer with its head slightly'],
("xpu", None): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown and white cow standing on a sandy beach next to a clear blue ocean. The cow is facing the viewer with its head slightly'],
}
)
EXPECTED_TEXTS = EXPECTATIONS.get_expectation()
# fmt: on
EXPECTED_TEXTS = Expectations({
("cuda", None): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown and white cow standing on a sandy beach next to a clear blue ocean. The cow is facing the viewer with its head slightly'],
("xpu", None): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown and white cow standing on a sandy beach next to a clear blue ocean. The cow is facing the viewer with its head slightly'],
("rocm", (9, 4)): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown and white cow standing on a sandy beach next to a turquoise ocean. The sky is blue with a few white clouds. The'],
}).get_expectation() # fmt: skip
self.assertEqual(len(inputs["pixel_values"]), EXPECTED_NUM_IMAGES)
self.assertEqual(output_text, EXPECTED_TEXTS)
@@ -948,17 +937,11 @@ class Gemma3nIntegrationTest(unittest.TestCase):
output = model.generate(**inputs, max_new_tokens=30, do_sample=False)
output_text = self.processor.batch_decode(output, skip_special_tokens=True)
# fmt: off
EXPECTATIONS = Expectations(
{
("cuda", None): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat do you see here?\nmodel\nIn the image, I see a street scene in what appears to be a Chinatown district. Here are some key elements:\n\n* **A prominent red'],
("xpu", None): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat do you see here?\nmodel\nIn the image, I see a street scene in what appears to be a Chinatown district. Here are the key elements:\n\n* **A prominent red'],
}
)
EXPECTED_TEXTS = EXPECTATIONS.get_expectation()
# fmt: on
EXPECTED_TEXTS = Expectations({
("cuda", None): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat do you see here?\nmodel\nIn the image, I see a street scene in what appears to be a Chinatown district. Here are some key elements:\n\n* **A prominent red'],
("xpu", None): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat do you see here?\nmodel\nIn the image, I see a street scene in what appears to be a Chinatown district. Here are the key elements:\n\n* **A prominent red'],
("rocm", (9, 4)): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat do you see here?\nmodel\nIn the image, I see a street scene in what appears to be a Chinatown district. \n\nHere are some key elements:\n\n* **A'],
}).get_expectation() # fmt: skip
self.assertEqual(output_text, EXPECTED_TEXTS)
@unittest.skip("For now, using a gemma model with the 3n class is not supported")
@@ -1034,15 +1017,10 @@ class Gemma3nIntegrationTest(unittest.TestCase):
]
output_text = tokenizer.batch_decode(out)
# fmt: off
EXPECTATIONS = Expectations(
{
("cuda", None): [" and I am glad to be here. This is a nice place. This is a nice place.", ", green, yellow, purple, orange, pink, brown, black, white.\n\nHere are"],
("xpu", None): [" and I think it is very nice. I think it is nice. This is a nice place.", ", green, yellow, purple, orange, pink, brown, black, white.\n\nHere are"],
}
)
EXPECTED_COMPLETIONS = EXPECTATIONS.get_expectation()
# fmt: on
EXPECTED_COMPLETIONS = Expectations({
# FIXME: This test is VERY flaky on ROCm
("cuda", None): [" and I am glad to be here. This is a nice place. This is a nice place.", ", green, yellow, purple, orange, pink, brown, black, white.\n\nHere are"],
("rocm", (9, 4)): [' and I think it makes this place special. This is a nice place. This is a nice place', ', green, yellow, purple, orange, pink, brown, black, white.\n\nHere are'],
("xpu", None): [" and I think it is very nice. I think it is nice. This is a nice place.", ", green, yellow, purple, orange, pink, brown, black, white.\n\nHere are"],
}).get_expectation() # fmt: skip
self.assertEqual(output_text, EXPECTED_COMPLETIONS)

View File

@@ -27,6 +27,7 @@ from transformers import (
is_torch_available,
is_vision_available,
)
from transformers.image_utils import load_image
from transformers.testing_utils import (
Expectations,
cleanup,
@@ -47,6 +48,7 @@ from ...test_modeling_common import (
floats_tensor,
ids_tensor,
)
from ...test_processing_common import url_to_local_path
if is_cv2_available():
@@ -454,8 +456,8 @@ class Qwen2_5_VLIntegrationTest(unittest.TestCase):
],
}
]
url = "https://qianwen-res.oss-accelerate-overseas.aliyuncs.com/Qwen2-VL/demo_small.jpg"
self.image = Image.open(requests.get(url, stream=True).raw)
img_url = url_to_local_path("https://qianwen-res.oss-accelerate-overseas.aliyuncs.com/Qwen2-VL/demo_small.jpg")
self.image = load_image(img_url).convert("RGB")
cleanup(torch_device, gc_collect=True)
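
For context, this change replaces the per-test HTTP fetch with the cached-download helper imported above; a rough sketch of the pattern, assuming url_to_local_path downloads the file once and returns the cached local path (the stand-in helper below is hypothetical and only illustrates that idea; the real one lives in test_processing_common):

import hashlib, os, requests
from transformers.image_utils import load_image

def url_to_local_path_sketch(url: str, cache_dir: str = "/tmp/test_images") -> str:
    # Hypothetical stand-in: fetch the URL once, then reuse the cached copy on later calls.
    os.makedirs(cache_dir, exist_ok=True)
    local = os.path.join(cache_dir, hashlib.sha256(url.encode()).hexdigest() + ".jpg")
    if not os.path.exists(local):
        with open(local, "wb") as f:
            f.write(requests.get(url, timeout=60).content)
    return local

image = load_image(url_to_local_path_sketch(
    "https://qianwen-res.oss-accelerate-overseas.aliyuncs.com/Qwen2-VL/demo_small.jpg"
)).convert("RGB")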

View File

@@ -5,7 +5,8 @@ IMPORTANT_MODELS = [
"gpt2",
"t5",
"modernbert",
"vit,clip",
"vit",
"clip",
"detr",
"table_transformer",
"got_ocr2",