DeepSpeed/deepspeed/ops/transformer/inference/diffusers_transformer_block.py
Commit 1a8ad24f0d by Nir Sonnenschein: fix issues raised by Coverity scans (#7431)
This commit combines fixes for 37 potential code issues found in
Coverity scans. The issues include, but are not limited to, potential
access to uninitialized variables and dead or redundant code.
We understand that reviewing such a commit can be difficult and will be
happy to help with any questions or changes required.

---------

Signed-off-by: Nir Sonnenschein <nsonnenschein@habana.ai>
Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com>
Co-authored-by: Olatunji Ruwase <tunji.ruwase@snowflake.com>
2025-08-02 12:16:10 -04:00

# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

import torch
import torch.nn as nn

from deepspeed import module_inject
from .diffusers_attention import DeepSpeedDiffusersAttention
from .bias_add import nhwc_bias_add
from .diffusers_2d_transformer import Diffusers2DTransformerConfig
from deepspeed.utils.types import ActivationFuncType
from .op_binding.gated_activation import GatedActivationOp
from .op_binding.layer_norm import LayerNormOp
class DeepSpeedDiffusersTransformerBlock(nn.Module):

    def __init__(self, equivalent_module: nn.Module, config: Diffusers2DTransformerConfig):
        super(DeepSpeedDiffusersTransformerBlock, self).__init__()
        self.quantizer = module_inject.GroupQuantizer(q_int8=config.int8_quantization)
        # Ensure ops are built by the time we start running
        self.config = config

        self.ff1_w = self.quantizer.quantize(
            nn.Parameter(equivalent_module.ff.net[0].proj.weight.data, requires_grad=False))
        self.ff1_b = nn.Parameter(equivalent_module.ff.net[0].proj.bias.data, requires_grad=False)
        self.ff2_w = self.quantizer.quantize(
            nn.Parameter(equivalent_module.ff.net[2].weight.data, requires_grad=False))
        self.ff2_b = nn.Parameter(equivalent_module.ff.net[2].bias.data, requires_grad=False)

        self.norm1_g = nn.Parameter(equivalent_module.norm1.weight.data, requires_grad=False)
        self.norm1_b = nn.Parameter(equivalent_module.norm1.bias.data, requires_grad=False)
        self.norm1_eps = equivalent_module.norm1.eps

        self.norm2_g = nn.Parameter(equivalent_module.norm2.weight.data, requires_grad=False)
        self.norm2_b = nn.Parameter(equivalent_module.norm2.bias.data, requires_grad=False)
        self.norm2_eps = equivalent_module.norm2.eps

        self.norm3_g = nn.Parameter(equivalent_module.norm3.weight.data, requires_grad=False)
        self.norm3_b = nn.Parameter(equivalent_module.norm3.bias.data, requires_grad=False)
        self.norm3_eps = equivalent_module.norm3.eps

        self.attn_1 = equivalent_module.attn1
        self.attn_2 = equivalent_module.attn2

        # Pull the output bias into this block when attn1 is the DeepSpeed kernel;
        # otherwise fall back to a zero bias of the matching shape.
        if isinstance(self.attn_1, DeepSpeedDiffusersAttention):
            self.attn_1.do_out_bias = False
            self.attn_1_bias = self.attn_1.attn_ob
        else:
            self.attn_1_bias = nn.Parameter(torch.zeros_like(self.norm2_g), requires_grad=False)

        # Same for attn2.
        if isinstance(self.attn_2, DeepSpeedDiffusersAttention):
            self.attn_2.do_out_bias = False
            self.attn_2_bias = self.attn_2.attn_ob
        else:
            self.attn_2_bias = nn.Parameter(torch.zeros_like(self.norm3_g), requires_grad=False)

        self.gated_activation = GatedActivationOp()
        self.layer_norm = LayerNormOp()
    def forward(self, hidden_states, context=None, timestep=None, **kwargs):
        """Pre-LayerNorm transformer block: self-attention (attn_1), cross-attention
        against `context` (attn_2), then a GEGLU feed-forward, with the residual adds
        fused into the layer-norm and bias-add kernels."""
        # In v0.12.0 of diffusers, several new kwargs were added. Capture
        # them with **kwargs to maintain backward compatibility.
        # In v0.11.0 of diffusers, the kwarg was changed from 'context' to
        # 'encoder_hidden_states'; accept both so older and newer versions
        # of diffusers are supported.
        if "encoder_hidden_states" in kwargs and kwargs["encoder_hidden_states"] is not None:
            context = kwargs["encoder_hidden_states"]

        out_norm_1 = self.layer_norm(hidden_states, self.norm1_g, self.norm1_b, self.norm1_eps)
        out_attn_1 = self.attn_1(out_norm_1)

        out_norm_2, out_attn_1 = self.layer_norm.layer_norm_residual_store_pre_ln_res(
            out_attn_1, self.attn_1_bias, hidden_states, self.norm2_g, self.norm2_b, self.norm2_eps)
        out_attn_2 = self.attn_2(out_norm_2, context=context)

        out_norm_3, out_attn_2 = self.layer_norm.layer_norm_residual_store_pre_ln_res(
            out_attn_2, self.attn_2_bias, out_attn_1, self.norm3_g, self.norm3_b, self.norm3_eps)

        out_ff1 = nn.functional.linear(out_norm_3, self.ff1_w)
        out_geglu = self.gated_activation(out_ff1, self.ff1_b, ActivationFuncType.GATED_GELU)
        out_ff2 = nn.functional.linear(out_geglu, self.ff2_w)
        return nhwc_bias_add(out_ff2, self.ff2_b, other=out_attn_2)
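
For context, a minimal usage sketch (not part of the file above). This block is normally not constructed by hand; it is installed by DeepSpeed's kernel-injection path when a diffusers pipeline is passed to deepspeed.init_inference with replace_with_kernel_inject=True, mirroring DeepSpeed's stable-diffusion inference examples. The model id, dtype, prompt, and exact keyword arguments below are illustrative assumptions that may vary across DeepSpeed and diffusers versions, and the snippet assumes a CUDA device with the DeepSpeed inference ops buildable on it.

# Hypothetical usage sketch: rely on DeepSpeed's module injection to swap the
# pipeline's transformer blocks for DeepSpeedDiffusersTransformerBlock.
# Model id, dtype, and flags are illustrative, not taken from this file.
import torch
import deepspeed
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5",
                                               torch_dtype=torch.half).to("cuda")

# replace_with_kernel_inject=True triggers the injection policies that, among
# other replacements, wrap the UNet's transformer blocks with this class.
pipe = deepspeed.init_inference(pipe, dtype=torch.half, replace_with_kernel_inject=True)

image = pipe("an astronaut riding a horse", num_inference_steps=25).images[0]
image.save("astronaut.png")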