'''Copyright The Microsoft DeepSpeed Team'''

from .base import *
from .base_moe import *
from .features.megatron import MegatronContainer
from deepspeed.model_implementations.transformers.ds_megatron_gpt import DeepSpeedMegatronGPTInference
import torch
from .megatron_gpt import MegatronLayerPolicy
from packaging import version as pkg_version


class DS_MegatronGPTMoEContainer(MegatronContainer, BaseTransformerMoEContainer):

    def __init__(self, policy, config, model_config, layer_id):
        super().__init__(policy, config, model_config, layer_id)

        # All model specific things should be defined here instead of the base class.

    def create_module(self, config=None):
        _config = config if config is not None else self.ds_model_config
        self.module = DeepSpeedMegatronGPTInference(_config, mp_group=self.mp_group)
        self.module.config.scale_attention = self.scale_attention

        if self.megatron_v2:
            self.module.config.rotate_half = True
            self.module.config.rotate_every_two = False

        return self.module

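
# A minimal usage sketch (illustrative only, not part of the module_inject flow
# itself): the container is constructed from a layer policy and produces the
# fused inference module via create_module(). The `policy`, `config`,
# `model_config`, and `layer_id` values are assumed to come from the usual
# injection setup.
#
#   container = DS_MegatronGPTMoEContainer(policy, config, model_config, layer_id)
#   ds_module = container.create_module()  # a DeepSpeedMegatronGPTInference instance
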
# TODO: Megatron GPT MoE inherits from Megatron policy and replaces mlp
# TODO: Generalize MoE overall goal, expand beyond Megatron
class MegatronMoELayerPolicy(MegatronLayerPolicy):
    _orig_layer_class = None
    version = 0
    moe_type = 'standard'
    num_experts = 1

    def __init__(self, client_module, inference=True):
        super().__init__(inference)
        self.client_module = client_module
        # we use megatron version to differentiate between the old and new
        # megatron-lm source code
        if MegatronMoELayerPolicy._orig_layer_class is None:
            if pkg_version.parse(torch.__version__) <= pkg_version.parse("1.2"):
                MegatronMoELayerPolicy._orig_layer_class = None
            else:
                try:
                    from megatron.model.transformer import ParallelTransformerLayer
                    MegatronMoELayerPolicy._orig_layer_class = ParallelTransformerLayer
                except ImportError:
                    MegatronMoELayerPolicy._orig_layer_class = None

    def get_num_experts(self):
        return self.num_experts

    def mlp(self, moe_type='standard'):
        # for now, all of this is tightly coupled to megatron-deepspeed moe implementation
        # todo: think and refactor this to be more general

        #from deepspeed.moe.utils import has_moe_layers
        #moe, _ = has_moe_layers(self.client_module)

        moe_experts = self.client_module.mlp.deepspeed_moe.experts.deepspeed_experts if moe_type == 'standard' else \
                      self.client_module.mlp.moe.deepspeed_moe.experts.deepspeed_experts
        num_experts = len(moe_experts)
        self.num_experts = num_experts

        if moe_type == 'standard':
            # per-expert dense_h_to_4h / dense_4h_to_h weights and biases
            return [moe_experts[i].dense_h_to_4h.weight for i in range(num_experts)], \
                   [moe_experts[i].dense_h_to_4h.bias for i in range(num_experts)], \
                   [moe_experts[i].dense_4h_to_h.weight for i in range(num_experts)], \
                   [moe_experts[i].dense_4h_to_h.bias for i in range(num_experts)]
        else:
            # same per-expert parameters, plus the shared (non-expert) mlp and
            # coefficient weights used by the non-standard moe_type
            return [moe_experts[i].dense_h_to_4h.weight for i in range(num_experts)], \
                   [moe_experts[i].dense_h_to_4h.bias for i in range(num_experts)], \
                   [moe_experts[i].dense_4h_to_h.weight for i in range(num_experts)], \
                   [moe_experts[i].dense_4h_to_h.bias for i in range(num_experts)], \
                   self.client_module.mlp.mlp.dense_h_to_4h.weight, \
                   self.client_module.mlp.mlp.dense_h_to_4h.bias, \
                   self.client_module.mlp.mlp.dense_4h_to_h.weight, \
                   self.client_module.mlp.mlp.dense_4h_to_h.bias, \
                   self.client_module.mlp.coefficient.weight
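

# A minimal sketch of how the policy is typically exercised (illustrative only;
# `layer` is assumed to be a megatron ParallelTransformerLayer whose MLP has been
# replaced by a DeepSpeed MoE layer, i.e. it exposes `mlp.deepspeed_moe`):
#
#   policy = MegatronMoELayerPolicy(layer, inference=True)
#   fc1_w, fc1_b, fc2_w, fc2_b = policy.mlp(moe_type='standard')
#   assert policy.get_num_experts() == len(fc1_w)  # one entry per local expert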