Enable FA in AMD docker (#41069)

* Add FA to docker

* Use caching mechanism for qwen2_5

* Fix a typo in important models list

* Partial fixes for gemma3

* Added a commit ID for FA repo

* Detailed the expectation storage format

* Rebase fix

* Apply style fixes

---------

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
Rémi Ouazan authored on 2025-09-26 13:57:58 +02:00; committed by GitHub
parent 10f6891fc5
commit 50d2448a1a
5 changed files with 41 additions and 51 deletions

View File

@@ -35,3 +35,10 @@ RUN python3 -m pip uninstall -y kernels
# On ROCm, torchcodec is required to decode audio files and 0.4 or 0.6 fails
RUN python3 -m pip install --no-cache-dir "torchcodec==0.5"
# Install flash attention from source. Tested with commit 6387433156558135a998d5568a9d74c1778666d8
RUN git clone https://github.com/ROCm/flash-attention/ -b tridao && \
cd flash-attention && \
GPU_ARCHS="gfx942" python setup.py install
RUN python3 -m pip install --no-cache-dir einops
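
As a quick sanity check after building the image (a minimal sketch, not part of this diff; run inside the container), something like the following confirms the ROCm flash-attention build is importable and detected by transformers:

# Sketch: verify the flash-attention source build inside the built image.
import flash_attn
from transformers.utils import is_flash_attn_2_available

print(flash_attn.__version__)
print(is_flash_attn_2_available())  # should print True if the source build above succeeded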

View File

@@ -3244,7 +3244,9 @@ def unpack_device_properties(
class Expectations(UserDict[PackedDeviceProperties, Any]):
def get_expectation(self) -> Any:
"""
Find best matching expectation based on environment device properties.
Find the best matching expectation based on the environment's device properties. We look at the device_type
and the major and minor versions of the drivers. Expectations are stored as a dictionary with keys of the form
(device_type, (major, minor)); if the major and minor versions are not provided, None is used instead.
"""
return self.find_expectation(get_device_properties())
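
For illustration, a minimal sketch of the storage format described in the docstring above, using the same tuple keys that appear in the tests touched by this commit (the values here are placeholders, not real expectations):

from transformers.testing_utils import Expectations

# Keys are (device_type, (major, minor)); None matches any driver version for that device type.
EXPECTED_TEXTS = Expectations({
    ("cuda", None): "text expected on any CUDA device",
    ("rocm", (9, 4)): "text expected on ROCm with driver version 9.4",
    ("xpu", None): "text expected on any XPU device",
})
# get_expectation() resolves the closest match for the current environment's device properties.
expected = EXPECTED_TEXTS.get_expectation()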

View File

@@ -785,7 +785,10 @@ class Gemma3nIntegrationTest(unittest.TestCase):
output = model.generate(**inputs, max_new_tokens=30, do_sample=False)
output_text = self.processor.batch_decode(output, skip_special_tokens=True)
EXPECTED_TEXTS = ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown and white cow standing on a sandy beach next to a clear blue ocean. The cow is facing the viewer with its head slightly'] # fmt: skip
EXPECTED_TEXTS = Expectations({
("cuda", None): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown and white cow standing on a sandy beach next to a clear blue ocean. The cow is facing the viewer with its head slightly'],
("rocm", (9, 4)): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown and white cow standing on a sandy beach next to a turquoise ocean. The sky is blue with a few white clouds. The'],
}).get_expectation() # fmt: skip
self.assertEqual(output_text, EXPECTED_TEXTS)
def test_model_with_audio(self):
@@ -866,18 +869,11 @@ class Gemma3nIntegrationTest(unittest.TestCase):
output = model.generate(**inputs, max_new_tokens=30, do_sample=False)
output_text = self.processor.batch_decode(output, skip_special_tokens=True)
# fmt: off
EXPECTATIONS = Expectations(
{
("cuda", None): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown and white cow standing on a sandy beach next to a clear blue ocean. The cow is facing the viewer with its head slightly', "user\nYou are a helpful assistant.\n\n\n\n\n\n\n\n\n\nAre these images identical?\nmodel\nNo, the images are not identical. \n\nHere's a breakdown of the differences:\n\n* **Subject:** The first image features a cow"],
("xpu", None): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown and white cow standing on a sandy beach next to a turquoise ocean. The cow is facing the viewer with its head slightly turned', "user\nYou are a helpful assistant.\n\n\n\n\n\n\n\n\n\nAre these images identical?\nmodel\nNo, the images are not identical. \n\nHere's a breakdown of the differences:\n\n* **Subject:** The first image features a cow"],
}
)
EXPECTED_TEXTS = EXPECTATIONS.get_expectation()
# fmt: on
EXPECTED_TEXTS = Expectations({
("cuda", None): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown and white cow standing on a sandy beach next to a clear blue ocean. The cow is facing the viewer with its head slightly', "user\nYou are a helpful assistant.\n\n\n\n\n\n\n\n\n\nAre these images identical?\nmodel\nNo, the images are not identical. \n\nHere's a breakdown of the differences:\n\n* **Subject:** The first image features a cow"],
("rocm", (9, 4)): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown and white cow standing on a sandy beach next to a clear blue ocean. The cow is facing the viewer with its head slightly', "user\nYou are a helpful assistant.\n\n\n\n\n\n\n\n\n\nAre these images identical?\nmodel\nNo, the images are not identical. \n\nHere's a breakdown of the differences:\n\n* **Subject Matter:** The first image shows a"],
("xpu", None): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown and white cow standing on a sandy beach next to a turquoise ocean. The cow is facing the viewer with its head slightly turned', "user\nYou are a helpful assistant.\n\n\n\n\n\n\n\n\n\nAre these images identical?\nmodel\nNo, the images are not identical. \n\nHere's a breakdown of the differences:\n\n* **Subject:** The first image features a cow"],
}).get_expectation() # fmt: skip
self.assertEqual(output_text, EXPECTED_TEXTS)
def test_model_4b_image(self):
@@ -899,18 +895,11 @@ class Gemma3nIntegrationTest(unittest.TestCase):
output_text = self.processor.batch_decode(output, skip_special_tokens=True)
EXPECTED_NUM_IMAGES = 1 # Gemma3n does not support crops
# fmt: off
EXPECTATIONS = Expectations(
{
("cuda", None): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown and white cow standing on a sandy beach next to a clear blue ocean. The cow is facing the viewer with its head slightly'],
("xpu", None): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown and white cow standing on a sandy beach next to a clear blue ocean. The cow is facing the viewer with its head slightly'],
}
)
EXPECTED_TEXTS = EXPECTATIONS.get_expectation()
# fmt: on
EXPECTED_TEXTS = Expectations({
("cuda", None): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown and white cow standing on a sandy beach next to a clear blue ocean. The cow is facing the viewer with its head slightly'],
("xpu", None): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown and white cow standing on a sandy beach next to a clear blue ocean. The cow is facing the viewer with its head slightly'],
("rocm", (9, 4)): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown and white cow standing on a sandy beach next to a turquoise ocean. The sky is blue with a few white clouds. The'],
}).get_expectation() # fmt: skip
self.assertEqual(len(inputs["pixel_values"]), EXPECTED_NUM_IMAGES)
self.assertEqual(output_text, EXPECTED_TEXTS)
@@ -948,17 +937,11 @@ class Gemma3nIntegrationTest(unittest.TestCase):
output = model.generate(**inputs, max_new_tokens=30, do_sample=False)
output_text = self.processor.batch_decode(output, skip_special_tokens=True)
# fmt: off
EXPECTATIONS = Expectations(
{
("cuda", None): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat do you see here?\nmodel\nIn the image, I see a street scene in what appears to be a Chinatown district. Here are some key elements:\n\n* **A prominent red'],
("xpu", None): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat do you see here?\nmodel\nIn the image, I see a street scene in what appears to be a Chinatown district. Here are the key elements:\n\n* **A prominent red'],
}
)
EXPECTED_TEXTS = EXPECTATIONS.get_expectation()
# fmt: on
EXPECTED_TEXTS = Expectations({
("cuda", None): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat do you see here?\nmodel\nIn the image, I see a street scene in what appears to be a Chinatown district. Here are some key elements:\n\n* **A prominent red'],
("xpu", None): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat do you see here?\nmodel\nIn the image, I see a street scene in what appears to be a Chinatown district. Here are the key elements:\n\n* **A prominent red'],
("rocm", (9, 4)): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat do you see here?\nmodel\nIn the image, I see a street scene in what appears to be a Chinatown district. \n\nHere are some key elements:\n\n* **A'],
}).get_expectation() # fmt: skip
self.assertEqual(output_text, EXPECTED_TEXTS)
@unittest.skip("For now, using a gemma model with the 3n class is not supported")
@@ -1034,15 +1017,10 @@ class Gemma3nIntegrationTest(unittest.TestCase):
]
output_text = tokenizer.batch_decode(out)
# fmt: off
EXPECTATIONS = Expectations(
{
("cuda", None): [" and I am glad to be here. This is a nice place. This is a nice place.", ", green, yellow, purple, orange, pink, brown, black, white.\n\nHere are"],
("xpu", None): [" and I think it is very nice. I think it is nice. This is a nice place.", ", green, yellow, purple, orange, pink, brown, black, white.\n\nHere are"],
}
)
EXPECTED_COMPLETIONS = EXPECTATIONS.get_expectation()
# fmt: on
EXPECTED_COMPLETIONS = Expectations({
# FIXME: This test is VERY flaky on ROCm
("cuda", None): [" and I am glad to be here. This is a nice place. This is a nice place.", ", green, yellow, purple, orange, pink, brown, black, white.\n\nHere are"],
("rocm", (9, 4)): [' and I think it makes this place special. This is a nice place. This is a nice place', ', green, yellow, purple, orange, pink, brown, black, white.\n\nHere are'],
("xpu", None): [" and I think it is very nice. I think it is nice. This is a nice place.", ", green, yellow, purple, orange, pink, brown, black, white.\n\nHere are"],
}).get_expectation() # fmt: skip
self.assertEqual(output_text, EXPECTED_COMPLETIONS)

View File

@@ -27,6 +27,7 @@ from transformers import (
is_torch_available,
is_vision_available,
)
from transformers.image_utils import load_image
from transformers.testing_utils import (
Expectations,
cleanup,
@@ -47,6 +48,7 @@ from ...test_modeling_common import (
floats_tensor,
ids_tensor,
)
from ...test_processing_common import url_to_local_path
if is_cv2_available():
@@ -454,8 +456,8 @@ class Qwen2_5_VLIntegrationTest(unittest.TestCase):
],
}
]
url = "https://qianwen-res.oss-accelerate-overseas.aliyuncs.com/Qwen2-VL/demo_small.jpg"
self.image = Image.open(requests.get(url, stream=True).raw)
img_url = url_to_local_path("https://qianwen-res.oss-accelerate-overseas.aliyuncs.com/Qwen2-VL/demo_small.jpg")
self.image = load_image(img_url).convert("RGB")
cleanup(torch_device, gc_collect=True)
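
For context, this change replaces the per-test HTTP fetch with the cached-download helper imported above; a rough sketch of the pattern, assuming url_to_local_path downloads the file once and returns the cached local path (the stand-in helper below is hypothetical and only illustrates that idea; the real one lives in test_processing_common):

import hashlib, os, requests
from transformers.image_utils import load_image

def url_to_local_path_sketch(url: str, cache_dir: str = "/tmp/test_images") -> str:
    # Hypothetical stand-in: fetch the URL once, then reuse the cached copy on later calls.
    os.makedirs(cache_dir, exist_ok=True)
    local = os.path.join(cache_dir, hashlib.sha256(url.encode()).hexdigest() + ".jpg")
    if not os.path.exists(local):
        with open(local, "wb") as f:
            f.write(requests.get(url, timeout=60).content)
    return local

image = load_image(url_to_local_path_sketch(
    "https://qianwen-res.oss-accelerate-overseas.aliyuncs.com/Qwen2-VL/demo_small.jpg"
)).convert("RGB")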

View File

@@ -5,7 +5,8 @@ IMPORTANT_MODELS = [
"gpt2",
"t5",
"modernbert",
"vit,clip",
"vit",
"clip",
"detr",
"table_transformer",
"got_ocr2",