'''Copyright The Microsoft DeepSpeed Team'''

import copy

import torch

from deepspeed.ops.transformer import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig


def module_inject(layer_obj, model, config, micro_batch_size, max_seq_length, seed, preln, fp16=True):
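    """Recursively replace every ``layer_obj`` instance in ``model`` with a
    DeepSpeedTransformerLayer, copying the attention, feed-forward, and
    layer-norm parameters from the original layer."""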
    for name, child in model.named_children():
        if isinstance(child, layer_obj):
            print('REPLACING BertLayer')
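
            # Build the transformer-kernel config from the source model's hyperparameters.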
            cuda_config = DeepSpeedTransformerConfig(batch_size=micro_batch_size,
                                                     max_seq_length=max_seq_length,
                                                     hidden_size=config.hidden_size,
                                                     heads=config.num_attention_heads,
                                                     attn_dropout_ratio=config.attention_probs_dropout_prob,
                                                     hidden_dropout_ratio=config.hidden_dropout_prob,
                                                     num_hidden_layers=config.num_hidden_layers,
                                                     initializer_range=config.initializer_range,
                                                     seed=seed,
                                                     fp16=fp16,
                                                     pre_layer_norm=preln)

            new_module = DeepSpeedTransformerLayer(cuda_config)

            # copy relevant state from child -> new module
            qw = child.attention.self.query.weight
            qb = child.attention.self.query.bias
            kw = child.attention.self.key.weight
            kb = child.attention.self.key.bias
            vw = child.attention.self.value.weight
            vb = child.attention.self.value.bias
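
            # The fused kernel stores Q, K, and V as a single stacked parameter,
            # so concatenate the three projections along the output dimension (dim 0).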
            qkvw = torch.cat((qw, kw, vw), 0)
            qkvb = torch.cat((qb, kb, vb), 0)

            new_module.attn_qkvw.data = qkvw
            new_module.attn_qkvb.data = qkvb
            new_module.attn_ow.data = child.attention.output.dense.weight
            new_module.attn_ob.data = child.attention.output.dense.bias
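
            # Pre-LN and post-LN BERT variants expose their layer norms under
            # different attribute names, so select the matching source module.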
            if preln:
                attention_layerNorm = child.PostAttentionLayerNorm
            else:
                attention_layerNorm = child.attention.output.LayerNorm
            new_module.attn_nw.data = attention_layerNorm.weight
            new_module.attn_nb.data = attention_layerNorm.bias
            if preln:
                intermediate_FF = child.intermediate.dense_act
            else:
                intermediate_FF = child.intermediate.dense
            new_module.inter_w.data = intermediate_FF.weight
            new_module.inter_b.data = intermediate_FF.bias
            new_module.output_w.data = child.output.dense.weight
            new_module.output_b.data = child.output.dense.bias
            if preln:
                transformer_LayerNorm = child.PreAttentionLayerNorm
            else:
                transformer_LayerNorm = child.output.LayerNorm
            new_module.norm_w.data = transformer_LayerNorm.weight
            new_module.norm_b.data = transformer_LayerNorm.bias
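
            # Swap the configured replacement into the parent module under the same attribute name.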
            setattr(model, name, copy.deepcopy(new_module))

        else:
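            # Not a target layer: recurse into its children.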
            module_inject(layer_obj, child, config, micro_batch_size, max_seq_length, seed, preln, fp16)

    return model


def test_hi():
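    # These imports assume the 'turing' pre-LN BERT reference package is on the path.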
    from turing.nvidia_modelingpreln import BertConfig as BertConfigPreLN
    from turing.nvidia_modelingpreln import BertForQuestionAnswering as BertForQuestionAnsweringPreLN
    from turing.nvidia_modelingpreln import BertLayer
    bert_model_config = {
        "vocab_size_or_config_json_file": 119547,
        "hidden_size": 1024,
        "num_hidden_layers": 1,
        "num_attention_heads": 16,
        "intermediate_size": 4096,
        "hidden_act": "gelu",
        "hidden_dropout_prob": 0.1,
        "attention_probs_dropout_prob": 0.1,
        "max_position_embeddings": 512,
        "type_vocab_size": 2,
        "initializer_range": 0.02
    }
    bert_config = BertConfigPreLN(**bert_model_config)
    base_model = BertForQuestionAnsweringPreLN(bert_config, args=None)

    #base_model = LinearStack()

    test_model = copy.deepcopy(base_model)
    # preln=True: the reference model here is the pre-layer-norm variant.
    test_model = module_inject(BertLayer, test_model, bert_config, 4, 384, 1234, preln=True)

    print('BASE', base_model)
    print('TEST', test_model)

    #base_model.eval()
    #test_model.eval()

    #test_input = torch.rand(1, base_model.input_dim)

    #base_output = base_model(test_input)
    #test_output = test_model(test_input)
    #
    #assert torch.allclose(base_output, test_output, atol=3e-8)
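

if __name__ == "__main__":
    # Convenience entry point (a sketch; assumes the DeepSpeed transformer
    # kernels and the 'turing' package are available in this environment).
    test_hi()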