'''Copyright The Microsoft DeepSpeed Team'''

from .base import *
from deepspeed.model_implementations.transformers.ds_gpt import DeepSpeedGPTInference
from ..policy import TransformerPolicy


class DS_GPT2Container(BaseTransformerContainer):

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        # All model-specific things should be defined here instead of in the base class.

    def create_module(self, config=None):
        # Build the fused DeepSpeedGPTInference module for this layer, using the
        # container's stored model config unless an explicit config is passed in.
        _config = config if config is not None else self.ds_model_config
        self.module = DeepSpeedGPTInference(_config, mp_group=self.mp_group)
        self.module.config.scale_attention = self.scale_attention
        return self.module

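
# Usage sketch (illustrative, not part of the original file): DeepSpeed's module
# injection machinery normally builds this container around the matching
# HFGPT2LayerPolicy defined below and then calls create_module() to obtain the
# fused inference layer. The constructor arguments shown here mirror what
# BaseTransformerContainer is assumed to accept and are for illustration only:
#
#   container = DS_GPT2Container(policy=HFGPT2LayerPolicy(hf_gpt2_block),
#                                config=ds_inference_config,
#                                model_config=hf_model_config,
#                                layer_id=0,
#                                child=hf_gpt2_block)
#   ds_layer = container.create_module()
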

class HFGPT2LayerPolicy(TransformerPolicy):
    _orig_layer_class = None

    def __init__(self, client_module, inference=True):
        # HuggingFace GPT2 uses Conv1D (weights stored as [in_features, out_features],
        # i.e. transposed relative to nn.Linear) instead of a linear layer, so
        # linear_layer is set to False.
        super().__init__(inference, linear_layer=False)
        self.client_module = client_module
        try:
            import transformers
            HFGPT2LayerPolicy._orig_layer_class = transformers.models.gpt2.modeling_gpt2.GPT2Block
        except (ImportError, AttributeError):
            HFGPT2LayerPolicy._orig_layer_class = None

    def get_hidden_heads(self):
        return self.client_module.attn.embed_dim, \
               self.client_module.attn.num_heads

    def attention(self):
        return self.client_module.attn.c_attn.weight, \
               self.client_module.attn.c_attn.bias, \
               self.client_module.attn.c_proj.weight, \
               self.client_module.attn.c_proj.bias

    def mlp(self):
        return self.client_module.mlp.c_fc.weight, \
               self.client_module.mlp.c_fc.bias, \
               self.client_module.mlp.c_proj.weight, \
               self.client_module.mlp.c_proj.bias

    def layernorm(self):
        return self.client_module.ln_2.weight, \
               self.client_module.ln_2.bias, \
               self.client_module.ln_1.weight, \
               self.client_module.ln_1.bias
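
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original file): exercises the policy's
# accessor methods against a freshly initialized HuggingFace GPT2Block. It
# assumes the `transformers` package is installed and that this module is run
# in its package context (e.g. `python -m deepspeed.module_inject.containers.gpt2`),
# since the relative imports above prevent running the file as a standalone script.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from transformers import GPT2Config
    from transformers.models.gpt2.modeling_gpt2 import GPT2Block

    block = GPT2Block(GPT2Config())
    policy = HFGPT2LayerPolicy(block)

    hidden_size, num_heads = policy.get_hidden_heads()
    qkv_w, qkv_b, attn_out_w, attn_out_b = policy.attention()

    # The fused Q/K/V projection is a Conv1D, so its weight has shape
    # [hidden_size, 3 * hidden_size].
    print(hidden_size, num_heads, tuple(qkv_w.shape))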