@@ -2,7 +2,7 @@ from typing import Optional
 import torch
-from ..utils import is_torch_xpu_available, logging
+from ..utils import is_torch_xpu_available, is_torch_npu_available, logging
 from ..utils.import_utils import is_torch_greater_or_equal
@@ -80,6 +80,16 @@ def sdpa_attention_forward(
     if torch.jit.is_tracing() and isinstance(is_causal, torch.Tensor):
         is_causal = is_causal.item()
 
+    # By default, the SDPA interface on Ascend NPU does not dispatch to the FlashAttentionScore operator when the
+    # parameters are passed through as-is; it falls back to a concatenation of smaller internal operators. To use
+    # FlashAttentionScore, either is_causal must be enabled with attention_mask set to None, or is_causal must be
+    # disabled with attention_mask given as a boolean mask. So we adapt the parameters here to meet these conditions.
+    if is_torch_npu_available():
+        if is_causal:
+            attention_mask = None
+        else:
+            attention_mask = torch.logical_not(attention_mask.bool()).to(query.device)
+
     attn_output = torch.nn.functional.scaled_dot_product_attention(
         query,
         key,
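
For context, here is a minimal standalone sketch (not part of the diff) of the mask adaptation the new branch performs. The helper name `adapt_mask_for_npu_flash_attention` is hypothetical, and it assumes the additive float mask convention that transformers typically passes to SDPA (0.0 for positions to attend, a large negative value for masked positions).

```python
import torch


def adapt_mask_for_npu_flash_attention(attention_mask, is_causal):
    """Sketch of the parameter adaptation, independent of the NPU runtime.

    Returns (attention_mask, is_causal) in a form that matches the
    FlashAttentionScore dispatch conditions described in the diff:
    causal with no mask, or non-causal with a boolean mask.
    """
    if is_causal:
        # Causal path: drop the mask so the operator derives it internally.
        return None, True
    if attention_mask is None:
        # Hypothetical guard, not present in the diff: nothing to convert.
        return None, False
    # Non-causal path: convert the additive float mask to a boolean mask where
    # True means "attend". Non-zero (large negative) entries become True after
    # .bool(), so logical_not flips them to False (= masked out).
    return torch.logical_not(attention_mask.bool()), False


# Toy usage: batch=1, head=1, query_len=2, key_len=4, last key position masked.
mask = torch.zeros(1, 1, 2, 4)
mask[..., -1] = torch.finfo(mask.dtype).min
bool_mask, causal = adapt_mask_for_npu_flash_attention(mask, is_causal=False)
print(bool_mask)  # True for attended positions, False in the last column
print(causal)     # False
```

The resulting boolean mask follows `scaled_dot_product_attention` semantics, where a True entry means the position participates in attention, which is why the additive mask is inverted rather than compared against zero directly.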