Mirror of https://gitee.com/ascend/MindSpeed-RL.git
Synced 2025-10-20 16:23:45 +08:00
!309 【resharding】support EP+vpp+nooplayer
Merge pull request !309 from zhoubeirong/master
Changed file: .gitignore (vendored)
@@ -157,4 +157,7 @@ cython_debug/
 /kernel*/
 /logs/
 /model_from_hf/
 /model_weights/
+/outputs/
+/vllm/
+/vllm_ascend/
@@ -54,7 +54,11 @@ MindSpeed RL is a reinforcement learning acceleration framework built on the Ascend ecosystem, designed for Huawei [
 </tr>
 <tr>
 <td>vllm</td>
-<td>0.7.3</td>
+<td>main</td>
+</tr>
+<tr>
+<td>vllm-ascend</td>
+<td>main</td>
 </tr>
 </table>
 
@@ -1,244 +0,0 @@ (removed file)
-import os
-import argparse
-import logging
-
-import tensordict
-import torch
-import torch_npu
-from torch_npu.contrib import transfer_to_npu
-import vllm.distributed.parallel_state as ps
-
-from mindspeed_rl.models.rollout.vllm_engine import VLLMInferEngine
-from mindspeed_rl.utils.loggers import Loggers
-
-logger = Loggers(
-    name="vllm_engine_inference",
-)
-
-
-def get_args():
-    parser = argparse.ArgumentParser()
-    group = parser.add_argument_group(title='inference args')
-    group.add_argument('--tokenizer-name-or-path', type=str,
-                       help="Huggingface config path.")
-    group.add_argument('--load-format', type=str,
-                       choices=["auto", "megatron"], default="auto",
-                       help="Vllm weight load format, support auto from huggingface and from megatron format.")
-    group.add_argument('--load', type=str,
-                       default=None,
-                       help="Vllm weight path for megatron load format.")
-    group.add_argument('--tensor-parallel-size', type=int,
-                       default=1,
-                       help="infer tensor parallel size")
-    group.add_argument('--query', type=str, default="Write an essay about the importance of higher education.",
-                       help='Input query.')
-    group.add_argument('--task', type=str,
-                       choices=["generation", "chat"], default="chat",
-                       help='Inference task, generation or chat.')
-    group.add_argument('--gpu-memory-utilization', type=float, default=0.9,
-                       help='Device memory ratio allocated for vllm.')
-
-    group = parser.add_argument_group(title='distributed')
-    group.add_argument('--distributed-backend', default='nccl',
-                       choices=['nccl', 'gloo'],
-                       help='Which backend to use for distributed training.')
-    group.add_argument('--local-rank', type=int, default=int(os.getenv('LOCAL_RANK', '0')),
-                       help='Local rank passed from distributed launcher.')
-    group.add_argument('--prompt-type', type=str, default=None,
-                       choices=['default', 'empty', 'trl', 'qwen', 'qwen_r1', "qwen_math_r1", 'llama3', 'mistral', 'mixtral', 'gemma', 'llama2',
-                                'alpaca', 'deepseek2', 'deepseek2-lite', 'minicpm3', 'cpm', 'baichuan2', 'deepseek3'],
-                       help='Which template to use for constructing prompts in training/inference, e.g., "qwen".')
-    group.add_argument('--prompt-type-path', type=str, default=None,
-                       help='Path to the json file of templates.')
-    group = parser.add_argument_group(title='sampling params')
-    group.add_argument('--num-completions', type=int, default=1,
-                       help='Number of output sequences to return for the given prompt.')
-    group.add_argument('--logprobs', type=int, default=1,
-                       help='Number of log probabilities to return per output token.')
-    group.add_argument('--max-tokens', type=int, default=128,
-                       help='Maximum number of tokens to generate per output sequence.')
-    group.add_argument('--top-p', type=float, default=1.0,
-                       help='Float that controls the cumulative probability of the top tokens to consider.')
-    group.add_argument('--top-k', type=int, default=-1,
-                       help='Integer that controls the number of top tokens to consider. Set to -1 to consider all tokens.')
-    group.add_argument('--temperature', type=float, default=1.0,
-                       help='Float that controls the randomness of the sampling.')
-    return parser.parse_args()
-
-
-def process_outputs(outputs):
-    res = ""
-    for output in outputs:
-        prompt = output.prompt
-        generated_text = output.outputs[0].text
-        res = res + f"Prompt: {prompt!r}\nGenerated Text: {generated_text!r}\n"
-    res = res + "-" * 80
-    return res
-
-
-def main():
-    logger.info("start vllm_engine inference")
-    args = get_args()
-
-    sampling_config = {
-        "num_completions": args.num_completions,  # number of independent completions per input prompt
-        "logprobs": args.logprobs,  # number of top-token log probabilities to return
-        "max_tokens": args.max_tokens,  # maximum number of tokens to generate per output
-        "top_p": args.top_p,  # cumulative probability threshold for nucleus sampling
-        "top_k": args.top_k,  # number of highest-probability tokens considered during sampling
-        "temperature": args.temperature,  # temperature controlling sampling randomness
-        "detokenize": True  # whether to convert generated tokens back to a readable string
-    }
-
-    inference_engine = VLLMInferEngine(
-        megatron_config=None,
-        sampling_config=sampling_config,
-        train_expert_parallel_size=1,
-        infer_expert_parallel_size=1,
-        tokenizer_name_or_path=args.tokenizer_name_or_path,
-        prompt_type=args.prompt_type,
-        prompt_type_path=args.prompt_type_path,
-        train_tensor_parallel_size=args.tensor_parallel_size,
-        train_pipeline_parallel_size=1,
-        infer_tensor_parallel_size=args.tensor_parallel_size,
-        infer_pipeline_parallel_size=1,
-        max_num_seqs=1,
-        gpu_memory_utilization=args.gpu_memory_utilization,
-        trust_remote_code=True,
-        load_format=args.load_format
-    )
-
-    if args.load_format == "megatron":
-        tp_rank = ps._TP.rank_in_group
-        weights_path = os.path.join(args.load, f"iter_0000001/mp_rank_{tp_rank:02}/model_optim_rng.pt")
-
-        actor_weights = torch.load(weights_path)['model']
-        actor_weights = replace_state_dict_name(
-            actor_weights,
-            vllm_dict=inference_engine.model.state_dict(),
-            arch=inference_engine.model.__class__.__name__)
-        logger.info("sync_model_weights")
-        inference_engine.sync_model_weights(actor_weights)
-
-    logger.info("init_cache_engine")
-    inference_engine.init_cache_engine()
-
-    if args.task == "chat":
-        chat_task(inference_engine, args.query)
-    elif args.task == "generation":
-        generate_task(inference_engine, args.query)
-
-
-def chat_task(inference_engine, query):
-    conversation = [
-        {
-            "role": "user",
-            "content": query,
-        },
-    ]
-    outputs = inference_engine.chat(conversation)
-    res = process_outputs(outputs)
-    logger.info('Query: {}'.format(query))
-    logger.info('Responses:\n{}'.format(res))
-
-
-def generate_task(inference_engine, query):
-    outputs = inference_engine.llm.generate(
-        prompts=[query],
-        sampling_params=inference_engine.sampling_params,
-    )
-    res = process_outputs(outputs)
-    logger.info('Query: {}'.format(query))
-    logger.info('Responses:\n{}'.format(res))
-
-
-def replace_state_dict_name(state_dict, vllm_dict, arch=None):
-    params_mapping = [
-        # (megatron core gpt model name, vllm model name)
-        ("embedding.word_embeddings", "model.embed_tokens"),
-        ("self_attention.linear_qkv", "self_attn.qkv_proj"),
-        ("self_attention.linear_proj", "self_attn.o_proj"),
-        ("input_layernorm", "input_layernorm"),
-        ("pre_mlp_layernorm", "post_attention_layernorm"),
-        ("mlp.linear_fc1", "mlp.gate_up_proj"),
-        ("mlp.linear_fc2", "mlp.down_proj"),
-        ("decoder.final_layernorm", "model.norm"),
-        ("output_layer", "lm_head"),
-        # Deepseek add
-        ("self_attention.linear_qb", "self_attn.q_b_proj"),
-        ("self_attention.linear_kvb", "self_attn.kv_b_proj"),
-        ("mlp.router.weight", "mlp.gate.weight"),
-        ("mlp.router.expert_bias", "mlp.gate.e_score_correction_bias"),
-        ("mlp.shared_experts.linear_fc1", "mlp.shared_experts.gate_up_proj"),
-        ("mlp.shared_experts.linear_fc2", "mlp.shared_experts.down_proj"),
-        ("mlp.experts.weight1", "mlp.experts.w13_weight"),
-        ("mlp.experts.weight2", "mlp.experts.w2_weight"),
-        ("self_attention.q_layernorm", "self_attn.q_a_layernorm"),
-        ("self_attention.k_layernorm", "self_attn.kv_a_layernorm"),
-    ]
-
-    new_state_dict = {}
-    for name, loaded_weight in state_dict.items():
-        if "_extra_state" in name:
-            continue
-        if "Deepseek" in arch:
-            name = _replace_name_m2v_deepseek(name, params_mapping)
-        else:
-            name = _replace_name_m2v(name, params_mapping)
-
-        # the router bias in the raw weight is in fp32
-        if "e_score_correction_bias" in name:
-            loaded_weight = loaded_weight.to(vllm_dict[name].dtype)
-
-        # adapt to 'copy_' in the megatron weight loader to save memory
-        if "mlp.experts" in name:
-            loaded_weight = loaded_weight.view(vllm_dict[name].shape)
-
-        new_state_dict[name] = loaded_weight
-    return new_state_dict
-
-
-def _replace_name_m2v(name, name_mapping):
-    """
-    Transfer state dict names from megatron to vllm.
-    """
-    for m_name, v_name in name_mapping:
-        if m_name not in name:
-            continue
-        if "layers" in name:  # deal with decoder layers
-            name = name.replace("decoder", "model")
-            name_list = name.split(".")
-            if "layer_norm_weight" in name_list or "layer_norm_bias" in name_list:
-                param_name_list = name_list[:3]
-                param_name_list.append(v_name)
-                param_name = ".".join(param_name_list)
-            else:
-                param_name_list = name_list[:3]
-                weight_or_bias = name_list[-1]
-                param_name_list.append(v_name)
-                param_name_list.append(weight_or_bias)
-                param_name = ".".join(param_name_list)
-            return param_name
-        else:
-            param_name = name.replace(m_name, v_name)
-            return param_name
-    return name
-
-
-def _replace_name_m2v_deepseek(name, name_mapping):
-    """
-    Transfer state dict names from megatron to vllm.
-    """
-    for m_name, v_name in name_mapping:
-        if m_name not in name:
-            continue
-        if "layers" in name:  # deal with decoder layers
-            name = name.replace("decoder", "model")
-        param_name = name.replace(m_name, v_name)
-        return param_name
-    return name
-
-
-if __name__ == "__main__":
-    main()
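Note on the removed helper above: `_replace_name_m2v` keeps the `model.layers.<i>` prefix and swaps only the mapped module segment. A minimal, hypothetical walk-through for a single attention projection (the example name is illustrative, not taken from a real checkpoint):

```python
# Minimal sketch (not part of the repo): the renaming steps that
# replace_state_dict_name applies to one Megatron-core parameter.
megatron_name = "decoder.layers.0.self_attention.linear_qkv.weight"

# Step 1: the Megatron "decoder" prefix becomes vLLM's "model" prefix.
name = megatron_name.replace("decoder", "model")

# Step 2: keep the "model.layers.<i>" prefix, swap in the mapped module name,
# and re-attach the trailing "weight"/"bias" suffix.
parts = name.split(".")
vllm_module = "self_attn.qkv_proj"  # mapped from "self_attention.linear_qkv"
vllm_name = ".".join(parts[:3] + [vllm_module, parts[-1]])

print(vllm_name)  # -> model.layers.0.self_attn.qkv_proj.weight
```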
@@ -8,6 +8,15 @@ env_vars:
   HCCL_IF_BASE_PORT: '48000'
   CUDA_DEVICE_MAX_CONNECTIONS: '1'
   HYDRA_FULL_ERROR: '1'
+  VLLM_DP_SIZE: '1'
+  HCCL_BUFFSIZE: '256'
+  VLLM_USE_V1: '1'
+  VLLM_VERSION: '0.9.0'
+  VLLM_ENABLE_GRAPH_MODE: '0'
+  VLLM_ENABLE_MC2: '0'
+  HCCL_OP_EXPANSION_MODE: "AIV"
+  VLLM_ENABLE_TOPK_OPTIMZE: "1"
+
   # GLOO_SOCKET_IFNAME: "Your SOCKET IFNAME"
   # TP_SOCKET_IFNAME: "Your SOCKET IFNAME"
   # HCCL_SOCKET_IFNAME: "Your SOCKET IFNAME"
@@ -79,6 +79,7 @@ rl_config:
   guarantee_order: true
 
 generate_config:
+  enforce_eager: True
   trust_remote_code: true
   offload_train_optimizer: true
   offload_train_grad: true
@@ -80,6 +80,7 @@ rl_config:
   num_npus: 16
 
 generate_config:
+  enforce_eager: True
   trust_remote_code: true
   offload_train_optimizer: true
   offload_train_grad: true
@@ -98,16 +98,16 @@ pip install apex-0.1.dev*.whl
 ```shell
 git clone https://github.com/vllm-project/vllm.git
 cd vllm
-git checkout v0.7.3
+git checkout 5bc1ad6cee754405464a9957e86cf3a9302e4986
 pip install -r requirements-build.txt
 VLLM_TARGET_DEVICE=empty pip install .
 ```
 
 ### vllm_ascend installation
 ```shell
-git clone -b v0.7.3-dev https://github.com/vllm-project/vllm-ascend.git
+git clone -b main https://github.com/vllm-project/vllm-ascend.git
 cd vllm-ascend
-git checkout 0713836e95fe993feefe334945b5b273e4add1f1
+git checkout 908a851
 pip install -e .
 ```
 
@@ -1,32 +0,0 @@ (removed file)
-#!/bin/bash
-
-export GLOO_SOCKET_IFNAME="Your SOCKET IFNAME"
-export TP_SOCKET_IFNAME="Your SOCKET IFNAME"
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-
-GPUS_PER_NODE=8
-MASTER_ADDR="host ip"
-MASTER_PORT=6001
-NNODES=4
-NODE_RANK="node rank"
-
-DISTRIBUTED_ARGS="
-    --nproc_per_node $GPUS_PER_NODE \
-    --nnodes $NNODES \
-    --node_rank $NODE_RANK \
-    --master_addr $MASTER_ADDR \
-    --master_port $MASTER_PORT
-"
-
-INFER_ARGS="
-    --tokenizer-name-or-path 'your huggingface config path' \
-    --load-format megatron \
-    --load 'megatron weight path' \
-    --tensor-parallel-size 32 \
-    --task chat \
-"
-
-torchrun $DISTRIBUTED_ARGS cli/infer_vllm.py \
-    $INFER_ARGS \
-    --query "Write an essay about the importance of higher education." \
-    --distributed-backend nccl
@@ -25,6 +25,9 @@ class GenerateConfig(BaseConfig):
     dtype: Data type for model weights. Default is "bfloat16".
     gpu_memory_utilization: GPU memory utilization factor. Default is 0.5.
 
+    enforce_eager: Whether to always use eager-mode PyTorch. If True, we will disable ACL graph and always execute the model in eager mode.
+        If False, we will use ACL graph and eager execution in hybrid for maximal performance and flexibility.
+
     sampling_config: Configuration for text generation sampling. Default values are set for various sampling parameters.
         - num_completions: The number of independent completions to generate for each input prompt. Default is 1.
         - logprobs: The number of top tokens to return log probabilities for. Default is 1.
@@ -72,6 +75,7 @@ class GenerateConfig(BaseConfig):
 
         self.enable_prefix_caching = False
         self.num_scheduler_steps = 1
+        self.enforce_eager = False
 
         # Default values for the sampling configuration, used for the sampling strategy when generating text
         self.sampling_config = {
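The new `enforce_eager` switch presumably forwards to vLLM's own eager-mode option when the engine is built; the wiring through GenerateConfig is not shown in this hunk. A minimal stand-alone sketch of the flag itself (the model path is a placeholder):

```python
# Hedged sketch: vLLM exposes an enforce_eager flag on its LLM entry point;
# setting it to True skips graph capture and runs the model eagerly.
from vllm import LLM

llm = LLM(model="/path/to/model", enforce_eager=True)  # placeholder model path
```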
@@ -119,6 +119,7 @@ class MegatronConfig(BaseConfig):
     tensor_model_parallel_size: Size of tensor model parallelism (default: 1)
     pipeline_model_parallel_size: Size of pipeline model parallelism (default: 1)
     expert_model_parallel_size: Degree of expert model parallelism (default: 1)
+    num_layers_per_virtual_pipeline_stage: Number of layers per virtual pipeline stage, i.e. the vpp degree (default: None)
     lr: Learning rate (default: None)
     lr_decay_style: Learning rate decay style (default: 'linear')
     min_lr: Minimum learning rate (default: 0.0)
@@ -198,6 +199,7 @@ class MegatronConfig(BaseConfig):
     reuse_fp32_param: The distributed training optimizer frees up 'param copies of FP32 to save memory. (default: False)
     moe_tp_extend_ep: use tp group to extend experts parallelism instead of sharding weight tensor of experts in tp group
     moe_alltoall_overlap_comm: moe_alltoall_overlap_comm
+    noop_layers: string specifying the no-op (placeholder) layers (default: None)
     '''
 
     def __init__(self, training_config: Dict, model_config: Dict):
@@ -305,6 +307,7 @@ class MegatronConfig(BaseConfig):
         self.tensor_model_parallel_size = 1
         self.pipeline_model_parallel_size = 1
         self.expert_model_parallel_size = 1
+        self.num_layers_per_virtual_pipeline_stage = None
         self.lr = None
         self.lr_decay_style = 'linear'
         self.min_lr = 0.0
@@ -358,5 +361,6 @@ class MegatronConfig(BaseConfig):
         self.overlap_param_gather = False
         self.recompute_activation_function = False
         self.swap_attention = False
+        self.noop_layers = None
 
         self.update(training_config, model_config)
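For readers unfamiliar with no-op layers: the config above only stores the raw string. A purely illustrative sketch of one way such a string could be parsed into layer indices; the comma-separated format is an assumption, not taken from this repository:

```python
# Illustrative only: parse a hypothetical noop-layers string such as "0,31"
# into a set of layer indices that should be treated as placeholder layers.
def parse_noop_layers(noop_layers):
    if not noop_layers:
        return set()
    return {int(idx) for idx in str(noop_layers).split(",") if idx.strip()}

print(parse_noop_layers("0,31"))  # -> {0, 31}
```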
@@ -29,6 +29,7 @@ def validate_rl_args(
         raise ValueError(
             f"integrated_mode_config should not be set when use_integrated_worker mode is off.")
 
+
     # Validate the sequence length against the model's maximum length
     if generate_config.max_model_len < actor_config.seq_length:
         raise ValueError(
@@ -17,6 +17,7 @@ class BaseInferEngine(ABC):
             prompt_type: str = None,
             prompt_type_path: str = None,
             train_expert_parallel_size: int = 1,
+            train_context_parallel_size: int = 1,
             infer_tensor_parallel_size: int = 8,
             infer_pipeline_parallel_size: int = 1,
             infer_expert_parallel_size: int = 1,
@@ -34,6 +35,7 @@ class BaseInferEngine(ABC):
             train_tensor_parallel_size (int): Tensor parallel size during training.
             train_pipeline_parallel_size (int): Pipeline parallel size during training.
             train_expert_parallel_size (int): Expert parallel size during training.
+            train_context_parallel_size (int): Context parallel size during training.
             infer_tensor_parallel_size (int): Tensor parallel size during inference.
             infer_pipeline_parallel_size (int): Pipeline parallel size during inference.
             infer_expert_parallel_size (int): Expert parallel size during inference.
@@ -49,6 +51,7 @@ class BaseInferEngine(ABC):
         self.train_tensor_parallel_size = train_tensor_parallel_size
         self.train_pipeline_parallel_size = train_pipeline_parallel_size
         self.train_expert_parallel_size = train_expert_parallel_size
+        self.train_context_parallel_size = train_context_parallel_size
         self.infer_tensor_parallel_size = infer_tensor_parallel_size
         self.infer_pipeline_parallel_size = infer_pipeline_parallel_size
         self.infer_expert_parallel_size = infer_expert_parallel_size
@@ -92,6 +92,9 @@ class BaseTrainingEngine(ABC):
             shuffle_mini_batch=self.shuffle_mini_batch)
         n_micro_batch = len(batches)
         seq_len = batches[0]['input_ids'].shape[1]
+        data_iter = iter(batches)
+        if len(self.model) > 1:
+            data_iter = [iter(batches) for _ in self.model]
 
         self.loss_func.add_loss_meta_info(self.get_loss_meta_func())
 
@@ -104,7 +107,7 @@ class BaseTrainingEngine(ABC):
         # batch should be a list of batches inside micro-batches
         losses_reduced = self.forward_backward_func(
             forward_step_func=forward_step,
-            data_iterator=iter(batches),
+            data_iterator=data_iter,
             model=self.model,
             num_microbatches=n_micro_batch,
             seq_length=seq_len,
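The `data_iter` change above is what makes virtual pipeline parallelism work here: when the model is a list of vpp chunks, the interleaved forward-backward schedule expects one iterator per chunk rather than a single shared iterator. A toy sketch of the same branching (placeholder data, no Megatron dependency):

```python
# Minimal sketch with assumed placeholder data: with virtual pipeline
# parallelism the model is a list of chunks, and each chunk gets its own
# micro-batch iterator.
batches = [{"input_ids": [[1, 2, 3]]}, {"input_ids": [[4, 5, 6]]}]  # toy micro-batches
model = ["chunk0", "chunk1"]  # stand-in for a list of vpp model chunks

data_iter = iter(batches)
if len(model) > 1:
    # one independent iterator per virtual-pipeline chunk
    data_iter = [iter(batches) for _ in model]

# a single-chunk model keeps the plain iterator; a vpp model gets a list
print(type(data_iter))
```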
@@ -7,13 +7,6 @@ import torch
 import torch.nn as nn
 from transformers.configuration_utils import PretrainedConfig
 
-from vllm.model_executor.layers.linear import (
-    ColumnParallelLinear, MergedColumnParallelLinear, QKVParallelLinear,
-    RowParallelLinear, ReplicatedLinear)
-from vllm.model_executor.layers.fused_moe.layer import FusedMoE
-from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead, VocabParallelEmbedding
-from vllm.model_executor.models import ModelRegistry
-
 
 class InferParallelConfig:
     def __init__(self, infer_tensor_parallel_size: int, infer_pipeline_parallel_size: int, infer_expert_parallel_size: int):
@@ -88,14 +81,15 @@ def deepseek_megatron_weight_loader(actor_weights: Dict, vllm_model: nn.Module,
         if name not in params_dict.keys():
             raise ValueError(f"unexpected key {name} in deepseek_megatron_weight_loader")
         if "mlp.experts.w13_weight" in name:
-            loaded_weight.copy_(loaded_weight.view(hf_config.n_routed_experts, hf_config.hidden_size, -1).transpose(2, 1).contiguous())
+            loaded_weight.copy_(loaded_weight.view(hf_config.n_routed_experts // infer_paralle_config.infer_expert_parallel_size, hf_config.hidden_size, -1).transpose(2, 1).contiguous())
         if "mlp.experts.w2_weight" in name:
-            loaded_weight.copy_(loaded_weight.view(hf_config.n_routed_experts, -1, hf_config.hidden_size).transpose(2, 1).contiguous())
+            loaded_weight.copy_(loaded_weight.view(hf_config.n_routed_experts // infer_paralle_config.infer_expert_parallel_size, -1, hf_config.hidden_size).transpose(2, 1).contiguous())
         load_single_weight(params_dict, name, loaded_weight)
     return vllm_model
 
 
 def _get_model_weight_loader(arch: str):
+    from vllm.model_executor.models import ModelRegistry
     if arch in MODEL_MEGATRON_WEIGHT_LOADER_REGISTRY:
         return MODEL_MEGATRON_WEIGHT_LOADER_REGISTRY[arch]
     raise ValueError(f"Model architectures {arch} are not supported for now. "
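The divisor added above reflects expert parallelism: each inference rank only holds `n_routed_experts // infer_expert_parallel_size` routed experts, so the flat checkpoint buffer has to be viewed with the local expert count as the leading dimension before the transpose. A toy shape check with made-up sizes (not DeepSeek's real configuration):

```python
import torch

# Illustrative sizes only.
n_routed_experts = 8
infer_expert_parallel_size = 4
hidden_size = 16
ffn_hidden = 32

local_experts = n_routed_experts // infer_expert_parallel_size  # 2 experts per EP rank

# Flat per-rank w13 buffer as it might arrive from a checkpoint shard.
flat = torch.zeros(local_experts * hidden_size * 2 * ffn_hidden)

# Same reshape pattern as the loader above: (local experts, hidden, -1), then swap the last two dims.
w13 = flat.view(local_experts, hidden_size, -1).transpose(2, 1).contiguous()
print(w13.shape)  # torch.Size([2, 64, 16])
```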
@@ -146,6 +140,23 @@ def load_single_weight(params_dict, name, loaded_weight):
 
 
 def update_megatron_weight_loader():
+    from vllm.model_executor.layers.linear import (
+        ColumnParallelLinear, MergedColumnParallelLinear, QKVParallelLinear,
+        RowParallelLinear, ReplicatedLinear)
+    from vllm.model_executor.layers.fused_moe.layer import FusedMoE
+    from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead, VocabParallelEmbedding
+
+    LAYER_WEIGHT_MEGATRON_LOADER_REGISTRY = {
+        ColumnParallelLinear: parallel_weight_loader,
+        MergedColumnParallelLinear: parallel_weight_loader,
+        QKVParallelLinear: parallel_weight_loader,
+        RowParallelLinear: parallel_weight_loader,
+        VocabParallelEmbedding: parallel_weight_loader,
+        ParallelLMHead: parallel_weight_loader,
+        ReplicatedLinear: parallel_weight_loader,
+        FusedMoE: parallel_weight_loader
+    }
+
     for layer_class, weight_loader in LAYER_WEIGHT_MEGATRON_LOADER_REGISTRY.items():
         layer_class.weight_loader = weight_loader
 
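Moving the imports and the registry inside `update_megatron_weight_loader` defers touching vLLM until the function actually runs; the function itself is a class-attribute monkey patch. A toy sketch of that pattern with stand-in classes (names are placeholders, not vLLM's):

```python
# Minimal sketch of the monkey-patching pattern used above: every registered
# layer class gets its weight_loader attribute replaced by one shared loader.
class FakeLinear:
    def weight_loader(self, param, loaded_weight):
        return "original"

def parallel_weight_loader(self, param, loaded_weight):
    return "patched"

LAYER_REGISTRY = {FakeLinear: parallel_weight_loader}

for layer_class, weight_loader in LAYER_REGISTRY.items():
    layer_class.weight_loader = weight_loader

print(FakeLinear().weight_loader(None, None))  # -> "patched"
```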
@@ -176,16 +187,6 @@ MODEL_MEGATRON_WEIGHT_LOADER_REGISTRY = {
     "LlamaForCausalLM": llama_megatron_core_weight_loader,
     "Qwen2ForCausalLM": qwen_megatron_weight_loader,
     "DeepseekV3ForCausalLM": deepseek_megatron_weight_loader,
-}
-
-LAYER_WEIGHT_MEGATRON_LOADER_REGISTRY = {
-    ColumnParallelLinear: parallel_weight_loader,
-    MergedColumnParallelLinear: parallel_weight_loader,
-    QKVParallelLinear: parallel_weight_loader,
-    RowParallelLinear: parallel_weight_loader,
-    VocabParallelEmbedding: parallel_weight_loader,
-    ParallelLMHead: parallel_weight_loader,
-    ReplicatedLinear: parallel_weight_loader,
-    FusedMoE: parallel_weight_loader
+    "DeepseekV2ForCausalLM": deepseek_megatron_weight_loader,
+    "CustomDeepseekV3ForCausalLM": deepseek_megatron_weight_loader,
 }
@@ -4,11 +4,18 @@
 
 """Model and data parallel groups."""
 import os
+import re
+import socket
+import subprocess
+from datetime import timedelta
 from typing import Optional
 
 import torch
-import torch.distributed
+import torch.distributed as dist
 import vllm.distributed.parallel_state as ps
+import vllm_ascend.distributed.parallel_state as ascend_ps
+import vllm.envs as envs
+from vllm.config import get_current_vllm_config
 
 from vllm.distributed.parallel_state import (
     get_pp_group,
@@ -17,6 +24,10 @@ from vllm.distributed.parallel_state import (
     init_model_parallel_group,
 )
 
+from mindspeed_rl.utils.loggers import Loggers
+
+logger = Loggers(__name__)
+
 
 """
 This version is strongly tied with Megatron to implement HybridEngine and weight sharing between vllm and Megatron.
@@ -31,6 +42,12 @@ _DEVICE_MESH = None
 _TP = None
 # Pipeline model parallel group that the current rank belongs to.
 _PP = None
+# Expert model parallel group that the current rank belongs to.
+_EP = None
+# Expert tensor model parallel group that the current rank belongs to.
+_ETP = None
+# Data model parallel group that the current rank belongs to.
+_DP = None
 
 # Tensor model parallel group
 _TP_GROUP_RANKS = None
@@ -42,16 +59,20 @@ def get_vllm_tp_group_ranks():
 
 # This method is for initializing the ParallelGroup when using HybridEngine
 def initialize_parallel_state(
     distributed_init_method: str = "env://",
     backend: str = "hccl",
     infer_tensor_model_parallel_size: int = 1,
     train_tensor_model_parallel_size: int = 1,
     infer_pipeline_model_parallel_size: int = 1,
-    train_pipeline_model_parallel_size: int = 1
+    train_pipeline_model_parallel_size: int = 1,
+    infer_expert_tensor_parallel_size: int = 1,
+    train_expert_tensor_parallel_size: int = 1,
+    train_expert_model_parallel_size: int = 1,
+    infer_expert_model_parallel_size: int = 1,
+    train_context_model_parallel_size: int = 1,
 ):
     os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"
 
     # NOTE(sgm): Modify for verl, Env vars will be set by TORCHRUN.
     rank = int(os.getenv("RANK", "-1"))
     local_rank = int(os.getenv("LOCAL_RANK", "0"))
@@ -60,6 +81,8 @@ def initialize_parallel_state(
     world_size = int(os.getenv("WORLD_SIZE", "-1"))
     if world_size == -1:
         raise ValueError("The world_size is set to -1, not initialized by TORCHRUN")
+    config = get_current_vllm_config()
+    config.parallel_config.tensor_parallel_size = infer_tensor_model_parallel_size
     init_distributed_environment(world_size, rank, distributed_init_method, local_rank, backend)
     if torch.distributed.get_world_size() > 1:
         # NOTE: build a sepearate inference group with infer tp & micro dp
@@ -67,54 +90,29 @@ def initialize_parallel_state(
             infer_tensor_model_parallel_size=infer_tensor_model_parallel_size,
             train_tensor_model_parallel_size=train_tensor_model_parallel_size,
             infer_pipeline_model_parallel_size=infer_pipeline_model_parallel_size,
-            train_pipeline_model_parallel_size=train_pipeline_model_parallel_size
+            train_pipeline_model_parallel_size=train_pipeline_model_parallel_size,
+            infer_expert_tensor_parallel_size=infer_expert_tensor_parallel_size,
+            train_expert_tensor_parallel_size=train_expert_tensor_parallel_size,
+            train_expert_model_parallel_size=train_expert_model_parallel_size,
+            infer_expert_model_parallel_size=infer_expert_model_parallel_size,
+            train_context_model_parallel_size=train_context_model_parallel_size
         )
     else:
         initialize_model_parallel(infer_tensor_model_parallel_size, infer_pipeline_model_parallel_size, backend)
 
 
-def ensure_model_parallel_initialized(
-    tensor_model_parallel_size: int,
-    pipeline_model_parallel_size: int = 1,
-    backend: Optional[str] = None,
-) -> None:
-    """Helper to initialize model parallel groups if they are not initialized,
-    or ensure tensor-parallel and pipeline-parallel sizes are equal to expected
-    values if the model parallel groups are initialized.
-    """
-    # get the backend of _DEVICE_WORLD_GROUP
-    backend = backend or torch.distributed.get_backend(get_world_group().device_group)
-    if not model_parallel_is_initialized():
-        initialize_model_parallel(tensor_model_parallel_size, pipeline_model_parallel_size, backend)
-        return
-
-    current_tp_size = get_tensor_model_parallel_world_size()
-    if current_tp_size != tensor_model_parallel_size:
-        raise ValueError(
-            "tensor parallel group already initialized, but of unexpected size: "
-            f"{current_tp_size=} vs. "
-            f"{tensor_model_parallel_size=}"
-        )
-    pp_world_size = get_pp_group().world_size
-    if pp_world_size != pipeline_model_parallel_size:
-        raise ValueError(
-            "pipeline parallel group already initialized, but of unexpected size: "
-            f"{pp_world_size=} vs. "
-            f"{pipeline_model_parallel_size=}"
-        )
-
-
-def model_parallel_is_initialized():
-    """Check if tensor and pipeline parallel groups are initialized."""
-    return ps._TP is not None
-    # and _PIPELINE_MODEL_PARALLEL_GROUP is not None)
-
-
 def initialize_model_parallel_for_vllm(
     infer_tensor_model_parallel_size: int,
     train_tensor_model_parallel_size: int = 1,
     infer_pipeline_model_parallel_size: int = 1,
-    train_pipeline_model_parallel_size: int = 1
+    train_pipeline_model_parallel_size: int = 1,
+    infer_expert_tensor_parallel_size: int = 1,
+    train_expert_tensor_parallel_size: int = 1,
+    train_expert_model_parallel_size: int = 1,
+    infer_expert_model_parallel_size: int = 1,
+    train_context_model_parallel_size: int = 1,
+    num_process: int = 1,
+    rebulid_EP_group: bool = False
 ) -> None:
 
     # Get world size and rank. Ensure some consistencies.
@@ -149,8 +147,10 @@ def initialize_model_parallel_for_vllm(
         Returns: list of group_lists
         [[g0, g1], [g2, g3], [g4, g5], [g6, g7]]
         '''
-        if ((world_size // (train_tensor_model_parallel_size * train_pipeline_model_parallel_size)) * train_tensor_model_parallel_size < infer_tensor_model_parallel_size or
-                ((world_size // (train_tensor_model_parallel_size * train_pipeline_model_parallel_size)) * train_tensor_model_parallel_size) % infer_tensor_model_parallel_size != 0):
+        if ((world_size // (
+                train_tensor_model_parallel_size * train_pipeline_model_parallel_size)) * train_tensor_model_parallel_size < infer_tensor_model_parallel_size or
+                ((world_size // (
+                        train_tensor_model_parallel_size * train_pipeline_model_parallel_size)) * train_tensor_model_parallel_size) % infer_tensor_model_parallel_size != 0):
             raise ValueError(
                 f"Can't split train tp size {train_tensor_model_parallel_size} to infer tp size {infer_tensor_model_parallel_size} "
                 f"with train dp size {(world_size // (train_tensor_model_parallel_size * train_pipeline_model_parallel_size))}.")
@@ -179,16 +179,13 @@ def initialize_model_parallel_for_vllm(
         [[g0, g2], [g1, g3], [g4, g6], [g5, g7]]
         '''
         if train_tensor_model_parallel_size < infer_tensor_model_parallel_size or train_tensor_model_parallel_size % infer_tensor_model_parallel_size != 0:
-            raise ValueError(f"Can't gather train tp size {train_tensor_model_parallel_size} to infer tp size {infer_tensor_model_parallel_size}")
+            raise ValueError(
+                f"Can't gather train tp size {train_tensor_model_parallel_size} to infer tp size {infer_tensor_model_parallel_size}")
         num_tensor_model_parallel_groups = world_size // infer_tensor_model_parallel_size
-        num_tensor_model_parallel_groups_per_train_tp = train_tensor_model_parallel_size // infer_tensor_model_parallel_size
         group_ranks = []
-        for i in range(num_tensor_model_parallel_groups // num_tensor_model_parallel_groups_per_train_tp):
-            start = train_tensor_model_parallel_size * i
-            end = train_tensor_model_parallel_size * (i + 1)
-            for j in range(num_tensor_model_parallel_groups_per_train_tp):
-                ranks = list(range(start + j, end, num_tensor_model_parallel_groups_per_train_tp))
-                group_ranks.append(ranks)
+        for i in range(num_tensor_model_parallel_groups):
+            ranks = list(range(i * infer_tensor_model_parallel_size, (i + 1) * infer_tensor_model_parallel_size))
+            group_ranks.append(ranks)
 
         return group_ranks
 
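The rewritten loop above simply carves the world into consecutive blocks of `infer_tensor_model_parallel_size` ranks. A toy enumeration with illustrative sizes (world size 8, inference TP 2), mirroring the code in the hunk:

```python
# Toy illustration of the contiguous inference-TP grouping used above.
# Sizes are made up for the example, not a recommended configuration.
world_size = 8
infer_tensor_model_parallel_size = 2

num_tensor_model_parallel_groups = world_size // infer_tensor_model_parallel_size
group_ranks = []
for i in range(num_tensor_model_parallel_groups):
    ranks = list(range(i * infer_tensor_model_parallel_size, (i + 1) * infer_tensor_model_parallel_size))
    group_ranks.append(ranks)

print(group_ranks)  # -> [[0, 1], [2, 3], [4, 5], [6, 7]]
```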
@@ -201,9 +198,11 @@ def initialize_model_parallel_for_vllm(
             _TP_GROUP_RANKS = tp_group_ranks
             return tp_group_ranks
 
+    tp_group_ranks = get_tp_group_ranks()
+    logger.info(f"TP rank: {tp_group_ranks}")
     _TP = init_model_parallel_group(
-        group_ranks=get_tp_group_ranks(),
+        group_ranks=tp_group_ranks,
         local_rank=get_world_group().local_rank,
         backend=backend,
         use_message_queue_broadcaster=True,
@@ -218,16 +217,134 @@ def initialize_model_parallel_for_vllm(
         ranks = list(range(i, world_size, num_pipeline_model_parallel_groups))
         group_ranks.append(ranks)
     # pipeline parallel does not need custom allreduce
+    logger.info(f"PP rank: {group_ranks}")
     _PP = init_model_parallel_group(
         group_ranks, get_world_group().local_rank, backend,
     )
     ps._PP = _PP  # for verl
 
+    data_parallel_size = 1
+    from vllm.config import get_current_vllm_config
+    config = get_current_vllm_config()
+
+    if config is not None:
+        data_parallel_size = config.parallel_config.data_parallel_size
+
+    num_expert_parallel_groups: int = infer_expert_tensor_parallel_size
+    num_expert_tensor_parallel_groups: int = world_size // infer_expert_tensor_parallel_size
+
+    num_rank_per_process = world_size // num_process
+    all_ranks = list(range(world_size))
+
+    global _EP
+    assert _EP is None, ("expert parallel group is already initialized")
+    group_ranks = []
+
+    if rebulid_EP_group:
+        # Rebuild the expert-parallel groups
+        tensor_model_parallel_size = train_tensor_model_parallel_size
+        context_parallel_size = train_context_model_parallel_size
+        expert_model_parallel_size = train_expert_model_parallel_size
+        train_data_parallel_size = world_size // tensor_model_parallel_size // train_pipeline_model_parallel_size
+        tensor_and_data_group_size_with_cp: int = tensor_model_parallel_size * train_data_parallel_size * context_parallel_size
+        num_tensor_and_data_groups_with_cp: int = world_size // tensor_and_data_group_size_with_cp
+        num_expert_groups: int = train_data_parallel_size * context_parallel_size // expert_model_parallel_size
+        tensor_and_expert_group_size = tensor_model_parallel_size * expert_model_parallel_size
+        all_tensor_and_expert_group_ranks = []
+        for i in range(num_tensor_and_data_groups_with_cp):
+            for j in range(num_expert_groups):
+                start_rank = i * tensor_and_data_group_size_with_cp + j * tensor_and_expert_group_size
+                end_rank = i * tensor_and_data_group_size_with_cp + (j + 1) * tensor_and_expert_group_size
+                ranks = range(start_rank, end_rank)
+                all_tensor_and_expert_group_ranks.append(list(ranks))
+        train_all_tensor_and_expert_group_ranks_tensor = torch.tensor(all_tensor_and_expert_group_ranks)
+        # Transpose the training-time EP groups according to the inference EP layout
+        infer_actual_expert_model_parallel_size = infer_tensor_model_parallel_size * infer_expert_model_parallel_size
+        experts_memory_expend_N = infer_actual_expert_model_parallel_size // tensor_and_expert_group_size
+        ep_group_num = world_size // tensor_and_expert_group_size
+        group_ranks = []
+        for i in range(0, ep_group_num, experts_memory_expend_N):
+            per_ep_group = train_all_tensor_and_expert_group_ranks_tensor[i:i + experts_memory_expend_N]
+            per_ep_group_T = per_ep_group.T
+            ranks = per_ep_group_T.reshape(-1).tolist()
+            group_ranks.append(ranks)
+        logger.info(f"EP rank: {group_ranks}")
+
+    else:
+        # Keep the original rank order
+        group_ranks = []
+        tensor_model_parallel_size = infer_tensor_model_parallel_size
+        context_parallel_size = 1
+        expert_model_parallel_size = infer_expert_model_parallel_size
+        infer_data_parallel_size = world_size // tensor_model_parallel_size // infer_pipeline_model_parallel_size
+        tensor_and_data_group_size_with_cp: int = tensor_model_parallel_size * infer_data_parallel_size * context_parallel_size
+        num_tensor_and_data_groups_with_cp: int = world_size // tensor_and_data_group_size_with_cp
+        num_expert_groups: int = infer_data_parallel_size * context_parallel_size // expert_model_parallel_size
+        tensor_and_expert_group_size = tensor_model_parallel_size * expert_model_parallel_size
+        group_ranks = []
+        for i in range(num_tensor_and_data_groups_with_cp):
+            for j in range(num_expert_groups):
+                start_rank = i * tensor_and_data_group_size_with_cp + j * tensor_and_expert_group_size
+                end_rank = i * tensor_and_data_group_size_with_cp + (j + 1) * tensor_and_expert_group_size
+                ranks = range(start_rank, end_rank)
+                group_ranks.append(list(ranks))
+        logger.info(f"EP rank: {group_ranks}")
+
+    ascend_ps._EP = init_model_parallel_group(group_ranks,
+                                              get_world_group().local_rank,
+                                              backend,
+                                              group_name="ep")
+
+    global _ETP
+    assert _ETP is None, (
+        "expert tensor parallel group is already initialized")
+
+    group_ranks = []
+    for i in range(num_expert_tensor_parallel_groups):
+        ranks = list(range(i * infer_expert_tensor_parallel_size,
+                           (i + 1) * infer_expert_tensor_parallel_size))
+        group_ranks.append(ranks)
+    logger.info(f"ETP rank: {group_ranks}")
+
+    ascend_ps._ETP = init_model_parallel_group(group_ranks,
+                                               get_world_group().local_rank,
+                                               backend,
+                                               group_name="etp")
+
+    if data_parallel_size > 1:
+        global _DP
+        assert _DP is None, ("data parallel group is already initialized")
+        dp_group_ranks = torch.tensor(tp_group_ranks).transpose(0, 1).reshape(-1, data_parallel_size).unbind(0)
+        group_ranks = [x.tolist() for x in dp_group_ranks]
+        logger.info(f"DP rank: {group_ranks}")
+
+        ps._DP = init_model_parallel_group(group_ranks,
+                                           get_world_group().local_rank,
+                                           backend,
+                                           group_name="dp")
+
+        os.environ["VLLM_DP_RANK"] = str(ps._DP.rank_in_group)
+        envs.VLLM_DP_RANK = int(os.environ["VLLM_DP_RANK"])
+        ip_list = get_cluster_info()
+
+        for index, group_rank in enumerate(group_ranks):
+            if torch.distributed.get_rank() in group_rank:
+                os.environ["VLLM_DP_MASTER_PORT"] = str(
+                    int(os.environ.get("MASTER_PORT")) + 1 + index)
+                os.environ["VLLM_DP_MASTER_IP"] = ip_list[group_rank[0]]
+
+        envs.VLLM_DP_MASTER_IP = os.environ["VLLM_DP_MASTER_IP"]
+        envs.VLLM_DP_MASTER_PORT = int(os.environ["VLLM_DP_MASTER_PORT"])
+        os.environ["VLLM_PORT"] = os.environ["VLLM_DP_MASTER_PORT"]
+        envs.VLLM_PORT = envs.VLLM_DP_MASTER_PORT
+
+        logger.info(f"rank: {torch.distributed.get_rank()}>>>>>>VLLM_DP_MASTER_IP: {envs.VLLM_DP_MASTER_IP}, VLLM_DP_MASTER_PORT: {envs.VLLM_DP_MASTER_PORT}")
+
 
 def initialize_model_parallel(
     tensor_model_parallel_size: int = 1,
     pipeline_model_parallel_size: int = 1,
     backend: Optional[str] = None,
 ) -> None:
     """
     NOTE: This method is a hack from the open-sourced version without
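The `rebulid_EP_group` branch above transposes the training-time tensor-and-expert groups so that, after resharding, expert shards line up across ranks in the order inference expects. A toy walk-through with illustrative sizes (two training groups of four ranks merged into one inference EP group):

```python
import torch

# Toy illustration of the EP regrouping above; sizes are made up for the example.
train_groups = torch.tensor([[0, 1, 2, 3],
                             [4, 5, 6, 7]])  # training-time tensor-and-expert groups
experts_memory_expend_N = 2                  # training groups merged per inference EP group

group_ranks = []
for i in range(0, train_groups.shape[0], experts_memory_expend_N):
    per_ep_group = train_groups[i:i + experts_memory_expend_N]
    ranks = per_ep_group.T.reshape(-1).tolist()  # transpose, then flatten
    group_ranks.append(ranks)

print(group_ranks)  # -> [[0, 4, 1, 5, 2, 6, 3, 7]]
```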
@@ -260,8 +377,6 @@ def initialize_model_parallel(
     world_size: int = torch.distributed.get_world_size()
     backend = backend or torch.distributed.get_backend(ps.get_world_group().device_group)
 
-
-
     num_tensor_model_parallel_groups: int = world_size // tensor_model_parallel_size
     global _TP
     if _TP is not None:
@@ -295,3 +410,60 @@ def initialize_model_parallel(
 
     ps._PP = _PP  # for verl
 
+
+def get_cluster_info():
+    # Make sure the distributed environment has been initialized
+    if not dist.is_initialized():
+        raise RuntimeError("Distributed environment not initialized")
+
+    world_size = dist.get_world_size()
+
+    # Get the IP address of the current node
+    ip_address = _get_current_node_ip()
+
+    # Gather the IP addresses of all ranks
+    ip_list = [None] * world_size
+    dist.all_gather_object(ip_list, ip_address)
+
+    return ip_list
+
+
+def _get_current_node_ip() -> str:
+    try:
+        # Create a UDP socket (only used to look up the interface address)
+        with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
+            # Connect to an external address (no real traffic is sent)
+            s.connect(("8.8.8.8", 80))  # Google DNS server
+            local_ip = s.getsockname()[0]
+    except Exception:
+        local_ip = _get_ip_by_ifname()
+        if not local_ip:
+            # If that fails, fall back to iterating over the host's interfaces
+            local_ip = "127.0.0.1"
+            hostname = socket.gethostname()
+            for addr in socket.getaddrinfo(hostname, None):
+                ip = addr[4][0]
+                if not ip.startswith("::"):
+                    local_ip = ip
+                    break
+    return local_ip
+
+
+def _get_ip_by_ifname():
+    """
+    Get the IPv4 address of an interface by name (e.g. eth0, en0).
+    Returns the IP string, or None on failure.
+    """
+    try:
+        # Run ifconfig for the interface and capture the output
+        ifname = os.environ.get("HCCL_SOCKET_IFNAME", 0)
+        if ifname:
+            output = subprocess.check_output(["ifconfig", ifname], stderr=subprocess.STDOUT).decode()
+            # Regex-match IPv4 addresses (excluding 127.0.0.1)
+            matches = re.findall(r'inet (?:addr:)?((?:\d{1,3}\.){3}\d{1,3})', output)
+            for ip in matches:
+                if ip != "127.0.0.1":
+                    return ip
+        return None
+    except subprocess.CalledProcessError:
+        return None
@@ -9,10 +9,10 @@ import gc
 import ray
 import torch
 import torch.distributed
+import torch_npu
+from torch_npu.contrib import transfer_to_npu
 from torch.nn.utils.rnn import pad_sequence
 from transformers import AutoConfig
-from vllm import LLM, SamplingParams
-from mindspeed_rl.utils.utils import mstx_timer_decorator
 
 
 def dummy_compile(*compile_args, **compile_kwargs):
@@ -23,14 +23,14 @@ def dummy_compile(*compile_args, **compile_kwargs):
         return wrapper
 
     return decorate
 
 
-torch.jit.script = dummy_compile
 torch.compile = dummy_compile
+torch.jit.script = dummy_compile
 
+from vllm import LLM, SamplingParams
+from vllm.v1.core.kv_cache_utils import get_kv_cache_config, unify_kv_cache_configs
 from mindspeed_rl.utils.loggers import Loggers
 from mindspeed_rl.models.base.base_inference_engine import BaseInferEngine
-from mindspeed_rl.config_cls.megatron_config import MegatronConfig
 from mindspeed_rl.models.rollout.vllm_adapter.vllm_parallel_state import initialize_parallel_state
 from mindspeed_rl.models.rollout.vllm_adapter.megatron_weight_loaders import (
     load_megatron_weights,
@@ -49,11 +49,12 @@ class VLLMInferEngine(BaseInferEngine):
             train_tensor_parallel_size: int,
             train_pipeline_parallel_size: int,
             train_expert_parallel_size: int,
+            train_context_parallel_size: int,
             infer_tensor_parallel_size: int,
             infer_pipeline_parallel_size: int,
             infer_expert_parallel_size: int,
-            megatron_config: MegatronConfig,
             sampling_config: dict,
+            infer_expert_tensor_parallel_size: int = 1,
             prompt_type: str = None,
             prompt_type_path: str = None,
             enable_prefix_caching: bool = False,
@@ -64,6 +65,7 @@ class VLLMInferEngine(BaseInferEngine):
             gpu_memory_utilization: float = 0.5,
             trust_remote_code: bool = True,
             load_format: str = "megatron",
+            enforce_eager: bool = False,
             **kwargs
     ):
         """
@@ -74,10 +76,12 @@ class VLLMInferEngine(BaseInferEngine):
             train_tensor_parallel_size (int): Tensor parallel size during training.
             train_pipeline_parallel_size (int): Pipeline parallel size during training.
             train_expert_parallel_size (int): Expert parallel size during training.
+            train_context_parallel_size (int): Context parallel size during training.
             infer_tensor_parallel_size (int): Tensor parallel size during inference.
             infer_pipeline_parallel_size (int): Pipeline parallel size during inference.
             infer_expert_parallel_size (int): Expert parallel size during inference.
             sampling_config (dict): Configuration for text generation sampling.
+            infer_expert_tensor_parallel_size (int): Expert tensor parallel size during inference.
             enable_prefix_caching (bool): Whether to enable prefix caching.
             num_scheduler_steps (int): Num scheduler steps. Default is 1.
             max_num_seqs (int): Maximum number of sequences to process simultaneously. Default is 1.
@ -95,6 +99,7 @@ class VLLMInferEngine(BaseInferEngine):
|
|||||||
train_tensor_parallel_size=train_tensor_parallel_size,
|
train_tensor_parallel_size=train_tensor_parallel_size,
|
||||||
train_pipeline_parallel_size=train_pipeline_parallel_size,
|
train_pipeline_parallel_size=train_pipeline_parallel_size,
|
||||||
train_expert_parallel_size=train_expert_parallel_size,
|
train_expert_parallel_size=train_expert_parallel_size,
|
||||||
|
train_context_parallel_size=train_context_parallel_size,
|
||||||
infer_tensor_parallel_size=infer_tensor_parallel_size,
|
infer_tensor_parallel_size=infer_tensor_parallel_size,
|
||||||
infer_pipeline_parallel_size=infer_pipeline_parallel_size,
|
infer_pipeline_parallel_size=infer_pipeline_parallel_size,
|
||||||
infer_expert_parallel_size=infer_expert_parallel_size,
|
infer_expert_parallel_size=infer_expert_parallel_size,
|
||||||
@ -105,6 +110,12 @@ class VLLMInferEngine(BaseInferEngine):
|
|||||||
trust_remote_code=trust_remote_code
|
trust_remote_code=trust_remote_code
|
||||||
)
|
)
|
||||||
# Additional initialization logic for VLLMInferEngine
|
# Additional initialization logic for VLLMInferEngine
|
||||||
|
|
||||||
|
torch.compile = dummy_compile
|
||||||
|
# vLLM Ascend must be patched in advance
|
||||||
|
from vllm_ascend.patch import platform
|
||||||
|
from vllm_ascend.patch import worker
|
||||||
|
|
||||||
# Initialize sampling parameters from SamplingConfig
|
# Initialize sampling parameters from SamplingConfig
|
||||||
self.sampling_config = sampling_config
|
self.sampling_config = sampling_config
|
||||||
try:
|
try:
|
||||||
@ -112,13 +123,11 @@ class VLLMInferEngine(BaseInferEngine):
|
|||||||
n=sampling_config.get('num_completions', 1),
|
n=sampling_config.get('num_completions', 1),
|
||||||
logprobs=sampling_config.get('logprobs', 1),
|
logprobs=sampling_config.get('logprobs', 1),
|
||||||
max_tokens=sampling_config.get('max_tokens', 128),
|
max_tokens=sampling_config.get('max_tokens', 128),
|
||||||
best_of=sampling_config.get('best_of', 2),
|
|
||||||
top_p=sampling_config.get('top_p', 1.0),
|
top_p=sampling_config.get('top_p', 1.0),
|
||||||
top_k=sampling_config.get('top_k', 50),
|
top_k=sampling_config.get('top_k', 50),
|
||||||
min_p=sampling_config.get('min_p', 0.0),
|
min_p=sampling_config.get('min_p', 0.0),
|
||||||
temperature=sampling_config.get('temperature', 0.2),
|
temperature=sampling_config.get('temperature', 0.2),
|
||||||
detokenize=sampling_config.get('detokenize', False),
|
detokenize=sampling_config.get('detokenize', False)
|
||||||
seed=sampling_config.get('seed', None)
|
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise ValueError(f"Error creating SamplingParams from dictionary") from e
|
raise ValueError(f"Error creating SamplingParams from dictionary") from e
|
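The hunk above only changes which keys of the sampling dictionary are forwarded (best_of and seed are dropped). As a minimal, self-contained sketch of the underlying pattern, the helper below mirrors the same dict-to-keyword mapping in plain Python; the function name build_sampling_kwargs is hypothetical and the defaults are illustrative, not authoritative.

def build_sampling_kwargs(sampling_config: dict) -> dict:
    # Map a loosely typed config dict onto explicit keyword arguments,
    # falling back to a default when a key is absent.
    return dict(
        n=sampling_config.get('num_completions', 1),
        logprobs=sampling_config.get('logprobs', 1),
        max_tokens=sampling_config.get('max_tokens', 128),
        top_p=sampling_config.get('top_p', 1.0),
        top_k=sampling_config.get('top_k', 50),
        min_p=sampling_config.get('min_p', 0.0),
        temperature=sampling_config.get('temperature', 0.2),
        detokenize=sampling_config.get('detokenize', False),
    )

# An empty config falls back to every default.
print(build_sampling_kwargs({}))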
@@ -139,7 +148,6 @@ class VLLMInferEngine(BaseInferEngine):
 
         # Initialize parallel state if tensor parallel size is specified
         if train_tensor_parallel_size is not None:
-            num_tp_per_train_tp = train_tensor_parallel_size // infer_tensor_parallel_size
             os.environ['CUDA_TIMER_STREAM_KAFKA_ENABLE'] = '0'
             os.environ['MEGATRON_IMPORT_TIMERS'] = '0'
             initialize_parallel_state(
@@ -147,32 +155,39 @@ class VLLMInferEngine(BaseInferEngine):
                 train_tensor_model_parallel_size=train_tensor_parallel_size,
                 infer_pipeline_model_parallel_size=infer_pipeline_parallel_size,
                 train_pipeline_model_parallel_size=train_pipeline_parallel_size,
+                train_expert_model_parallel_size=train_expert_parallel_size,
+                infer_expert_model_parallel_size=infer_expert_parallel_size,
+                train_context_model_parallel_size=train_context_parallel_size
             )
 
         if load_format == "megatron":
             update_megatron_weight_loader()
 
-        torch.jit.script = dummy_compile
-        torch.compile = dummy_compile
 
         # Initialize the LLM engine
         self.llm = LLM(
+            seed=1234,
             model=tokenizer_name_or_path,
             trust_remote_code=trust_remote_code,
             tensor_parallel_size=infer_tensor_parallel_size,
-            load_format="dummy" if load_format == "megatron" else "auto",
+            load_format='dummy' if load_format == 'megatron' else load_format,
             distributed_executor_backend="external_launcher",
             enable_prefix_caching=enable_prefix_caching,
             num_scheduler_steps=num_scheduler_steps,
             dtype=dtype,
-            enforce_eager=False,
+            enforce_eager=enforce_eager,
             skip_tokenizer_init=False,
             gpu_memory_utilization=gpu_memory_utilization,
             max_num_seqs=max_num_seqs,
-            max_model_len=max_model_len
+            max_model_len=max_model_len,
+            additional_config={
+                'expert_tensor_parallel_size': infer_expert_tensor_parallel_size,
+                'enable_graph_mode': int(os.environ.get('VLLM_ENABLE_GRAPH_MODE', '0')),
+                'ascend_scheduler_config': {},
+            }
         )
 
         self.model = self.llm.llm_engine.model_executor.driver_worker.worker.model_runner.get_model()
+        self.kv_cache_configs = None
 
         self.cpu_model = {}
         for name, params in self.model.named_parameters():
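One line worth calling out in the hunk above is the load_format selection: when the weights will be injected later from Megatron during resharding, vLLM is started with dummy weights rather than reading a checkpoint, and any other requested format is now passed straight through instead of being forced to "auto". A tiny sketch of just that selection logic (the behavior of the 'dummy' format itself belongs to vLLM and is not reproduced here; pick_load_format is a hypothetical helper name):

def pick_load_format(load_format: str) -> str:
    # Weights arriving from Megatron are copied in after engine start,
    # so vLLM only needs to allocate placeholders.
    return 'dummy' if load_format == 'megatron' else load_format

assert pick_load_format('megatron') == 'dummy'
assert pick_load_format('auto') == 'auto'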
@@ -180,14 +195,61 @@ class VLLMInferEngine(BaseInferEngine):
 
         if load_format == "megatron":
             self.free_cache_engine()
+            if os.environ['VLLM_USE_V1'] == '1':
+                self._initialize_kv_caches(self.llm.llm_engine.vllm_config)
             self.offload_model_weights()
 
+    from vllm.config import VllmConfig
+
+    def _initialize_kv_caches(self, vllm_config: VllmConfig):
+
+        # Get all kv cache needed by the model
+        kv_cache_specs = self.llm.llm_engine.engine_core.engine_core.model_executor.get_kv_cache_specs()
+
+        # Profiles the peak memory usage of the model to determine how much
+        # memory can be allocated for kv cache.
+        available_gpu_memory = self.llm.llm_engine.engine_core.engine_core.model_executor.determine_available_memory()
+
+        assert len(kv_cache_specs) == len(available_gpu_memory)
+        # Get the kv cache tensor size
+        self.kv_cache_configs = [
+            get_kv_cache_config(vllm_config, kv_cache_spec_one_worker,
+                                available_gpu_memory_one_worker)
+            for kv_cache_spec_one_worker, available_gpu_memory_one_worker in
+            zip(kv_cache_specs, available_gpu_memory)
+        ]
+
+        # Since we use a shared centralized controller, we need the
+        # `kv_cache_config` to be consistent across all workers to make sure
+        # all the memory operators can be applied to all workers.
+        unify_kv_cache_configs(self.kv_cache_configs)
+
+        # All workers have the same kv_cache_config except layer names, so use
+        # an arbitrary one to initialize the scheduler.
+        assert all([
+            cfg.num_blocks == self.kv_cache_configs[0].num_blocks
+            for cfg in self.kv_cache_configs
+        ])
+
     def init_cache_engine(self):
-        if self.llm.llm_engine.model_executor.driver_worker.worker.cache_engine is None:
-            self.llm.llm_engine.model_executor.driver_worker.worker._init_cache_engine()
+        if os.environ['VLLM_USE_V1'] == '1':
+            worker = self.llm.llm_engine.model_executor.driver_worker.worker
+            if not worker.model_runner.kv_caches:
+                # v1 uses the explicit initialization method
+                self.llm.llm_engine.engine_core.engine_core.model_executor.initialize_from_config(
+                    self.kv_cache_configs)
+        else:
+            if self.llm.llm_engine.model_executor.driver_worker.worker.cache_engine is None:
+                self.llm.llm_engine.model_executor.driver_worker.worker._init_cache_engine()
 
     def free_cache_engine(self):
-        ctx = self.llm.llm_engine.model_executor.driver_worker.worker.compilation_config.static_forward_context
+        if os.environ['VLLM_USE_V1'] == '1':
+            worker = self.llm.llm_engine.model_executor.driver_worker.worker
+
+            ctx = worker.model_runner.vllm_config.compilation_config.static_forward_context
+
+        else:
+            ctx = self.llm.llm_engine.model_executor.driver_worker.worker.compilation_config.static_forward_context
         from vllm.attention import AttentionType
 
         layer_need_kv_cache = []
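Both cache-engine methods above now branch on the VLLM_USE_V1 environment variable: '1' selects the v1 engine-core path (explicit kv_cache_configs), anything else keeps the legacy worker cache engine. A minimal sketch of that dispatch, under the assumption that an unset variable should mean the legacy path (the diff itself indexes os.environ directly and would raise if the variable is missing); kv_cache_backend is a hypothetical helper name:

import os

def kv_cache_backend() -> str:
    # Mirror of the switch used by init_cache_engine / free_cache_engine.
    return "v1" if os.environ.get("VLLM_USE_V1", "0") == "1" else "v0"

print(kv_cache_backend())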
@ -201,10 +263,14 @@ class VLLMInferEngine(BaseInferEngine):
|
|||||||
for _ in range(pipeline_parallel_size):
|
for _ in range(pipeline_parallel_size):
|
||||||
kv_cache.append(torch.tensor([]))
|
kv_cache.append(torch.tensor([]))
|
||||||
ctx[layer_name].kv_cache = kv_cache
|
ctx[layer_name].kv_cache = kv_cache
|
||||||
|
if os.environ['VLLM_USE_V1'] == '1':
|
||||||
|
worker = self.llm.llm_engine.model_executor.driver_worker.worker
|
||||||
|
|
||||||
self.llm.llm_engine.model_executor.driver_worker.worker.cache_engine = None
|
# 清理缓存引擎
|
||||||
self.llm.llm_engine.model_executor.driver_worker.worker.gpu_cache = None
|
worker.model_runner.kv_caches = []
|
||||||
|
else:
|
||||||
|
self.llm.llm_engine.model_executor.driver_worker.worker.cache_engine = None
|
||||||
|
self.llm.llm_engine.model_executor.driver_worker.worker.gpu_cache = None
|
||||||
if hasattr(self.model.model.layers[0].self_attn, "attn"):
|
if hasattr(self.model.model.layers[0].self_attn, "attn"):
|
||||||
for i in range(self.model.model.start_layer, self.model.model.end_layer):
|
for i in range(self.model.model.start_layer, self.model.model.end_layer):
|
||||||
attn_impl = self.model.model.layers[i].self_attn.attn.impl
|
attn_impl = self.model.model.layers[i].self_attn.attn.impl
|
||||||
@ -212,18 +278,26 @@ class VLLMInferEngine(BaseInferEngine):
|
|||||||
attn_impl.key_cache = None
|
attn_impl.key_cache = None
|
||||||
attn_impl.value_cache = None
|
attn_impl.value_cache = None
|
||||||
|
|
||||||
self.gpu_cache = None
|
|
||||||
|
|
||||||
gc.collect()
|
gc.collect()
|
||||||
torch.cuda.empty_cache()
|
torch.cuda.empty_cache()
|
||||||
|
|
||||||
|
|
||||||
def offload_model_weights(self):
|
def offload_model_weights(self):
|
||||||
for name, params in self.model.named_parameters():
|
for name, params in self.model.named_parameters():
|
||||||
params.data = self.cpu_model[name]
|
params.data = self.cpu_model[name]
|
||||||
|
if hasattr(self.model.model.layers[-1].self_attn, "mla_attn"):
|
||||||
|
for i in range(self.model.model.start_layer, self.model.model.end_layer):
|
||||||
|
mla = self.model.model.layers[i].self_attn.mla_attn.impl
|
||||||
|
if hasattr(mla, "w_kc"):
|
||||||
|
mla.w_kc = None
|
||||||
|
mla.w_vc = None
|
||||||
|
if hasattr(mla, "W_UV"):
|
||||||
|
mla.W_UV = None
|
||||||
|
mla.W_UK_T = None
|
||||||
|
|
||||||
def sync_model_weights(self, params, load_format='megatron'):
|
def sync_model_weights(self, params, load_format='megatron'):
|
||||||
infer_parallel_config = InferParallelConfig(self.infer_tensor_parallel_size, self.infer_pipeline_parallel_size,
|
infer_parallel_config = InferParallelConfig(self.infer_tensor_parallel_size, self.infer_pipeline_parallel_size,
|
||||||
self.infer_expert_parallel_size)
|
self.infer_expert_parallel_size * self.infer_tensor_parallel_size)
|
||||||
load_megatron_weights(params,
|
load_megatron_weights(params,
|
||||||
self.model,
|
self.model,
|
||||||
infer_parallel_config,
|
infer_parallel_config,
|
||||||
@ -237,9 +311,12 @@ class VLLMInferEngine(BaseInferEngine):
|
|||||||
if hasattr(mla, "w_kc"):
|
if hasattr(mla, "w_kc"):
|
||||||
mla.w_kc = None
|
mla.w_kc = None
|
||||||
mla.w_vc = None
|
mla.w_vc = None
|
||||||
|
if hasattr(mla, "W_UV"):
|
||||||
|
mla.W_UV = None
|
||||||
|
mla.W_UK_T = None
|
||||||
|
mla.process_weights_after_loading(None)
|
||||||
|
|
||||||
@torch.no_grad()
|
@torch.no_grad()
|
||||||
@mstx_timer_decorator
|
|
||||||
def generate_sequences(self, idx_list, **kwargs):
|
def generate_sequences(self, idx_list, **kwargs):
|
||||||
self.init_cache_engine()
|
self.init_cache_engine()
|
||||||
with self.update_sampling_params(**kwargs):
|
with self.update_sampling_params(**kwargs):
|
||||||
@ -325,4 +402,4 @@ def get_local_rank() -> int:
|
|||||||
|
|
||||||
# Default to 0 (for testing or single-process scenarios)
|
# Default to 0 (for testing or single-process scenarios)
|
||||||
logger.warning("Warning: Unable to determine local rank. Defaulting to 0.")
|
logger.warning("Warning: Unable to determine local rank. Defaulting to 0.")
|
||||||
return 0
|
return 0
|
@ -259,8 +259,8 @@ class _HuggingFaceTokenizer(BaseTokenizer):
|
|||||||
def tokenize(self, text):
|
def tokenize(self, text):
|
||||||
return self.tokenizer(text).input_ids
|
return self.tokenizer(text).input_ids
|
||||||
|
|
||||||
def detokenize(self, token_ids):
|
def detokenize(self, token_ids, **kwargs):
|
||||||
return self.tokenizer.decode(token_ids)
|
return self.tokenizer.decode(token_ids, **kwargs)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def eod(self):
|
def eod(self):
|
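The tokenizer hunk above widens detokenize with **kwargs so callers can forward decode options (for example skip_special_tokens) to the underlying tokenizer. A self-contained sketch of that forwarding pattern, using a stand-in tokenizer so it runs without the transformers package; _EchoTokenizer and Wrapper are illustrative names only:

class _EchoTokenizer:
    # Stand-in for a HuggingFace-style tokenizer, used only to show the call shape.
    def decode(self, token_ids, skip_special_tokens=False, **kwargs):
        return f"decode({token_ids}, skip_special_tokens={skip_special_tokens})"

class Wrapper:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def detokenize(self, token_ids, **kwargs):
        # **kwargs lets callers pass extra decode options without
        # widening this signature again later.
        return self.tokenizer.decode(token_ids, **kwargs)

print(Wrapper(_EchoTokenizer()).detokenize([1, 2, 3], skip_special_tokens=True))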
@@ -88,6 +88,9 @@ class ActorHybridWorkerBase(BaseWorker):
 
         self.inference_model = self._build_rollout()
         self.sharding_manager = self._build_sharding_manager()
 
+        if self.generate_config.offload_train_param:
+            self.actor_offloader.onload_param()
+
         self.actor_hybrid = ActorRolloutHybrid(
             self.model,
@@ -178,7 +181,7 @@ class ActorHybridWorkerBase(BaseWorker):
         self.args.consumed_train_samples += self.megatron_config.global_batch_size // self.rl_config.n_samples_per_prompt
         self.num_floating_point_operations_so_far += num_floating_point_operations(self.args,
                                                                                     self.megatron_config.global_batch_size)
-        if self.parallel_state.is_pipeline_last_stage() and self.parallel_state.get_tensor_model_parallel_rank() == 0:
+        if self.parallel_state.is_pipeline_last_stage(ignore_virtual=True) and self.parallel_state.get_tensor_model_parallel_rank() == 0:
             ray.get(self.td.update_metrics.remote(value=metrics, cumulate=True))
             ray.get(
                 self.td.update_metrics.remote(
@@ -371,6 +374,7 @@ class ActorHybridWorkerBase(BaseWorker):
             train_tensor_parallel_size=self.megatron_config.tensor_model_parallel_size,
             train_pipeline_parallel_size=self.megatron_config.pipeline_model_parallel_size,
             train_expert_parallel_size=self.megatron_config.expert_model_parallel_size,
+            train_context_parallel_size=self.megatron_config.context_parallel_size,
             infer_tensor_parallel_size=self.generate_config.infer_tensor_parallel_size,
             infer_pipeline_parallel_size=self.generate_config.infer_pipeline_parallel_size,
             infer_expert_parallel_size=self.generate_config.infer_expert_parallel_size,
@@ -382,9 +386,9 @@ class ActorHybridWorkerBase(BaseWorker):
             max_model_len=self.generate_config.max_model_len,
             dtype=self.generate_config.dtype,
             gpu_memory_utilization=self.generate_config.gpu_memory_utilization,
-            trust_remote_code=self.generate_config.trust_remote_code
+            trust_remote_code=self.generate_config.trust_remote_code,
+            enforce_eager=self.generate_config.enforce_eager,
         )
 
         return rollout
 
     def _build_sharding_manager(self):
@@ -404,7 +408,8 @@ class ActorHybridWorkerBase(BaseWorker):
             grad_offload=self.generate_config.offload_train_grad,
             train_param_offload=self.generate_config.offload_train_param,
             enable_validate=self.rl_config.enable_sharding_validate,
-            megatron_offloader=self.actor_offloader
+            megatron_offloader=self.actor_offloader,
+            noop_layers=self.megatron_config.noop_layers
         )
         return sharding_manager
 
@@ -21,20 +21,27 @@ Manager used to shard weight and offload/onload optimizer from training stage to
 from itertools import chain
 from collections import defaultdict
 
+import os
 import torch
-import torch.distributed
+import torch.distributed as dist
+import vllm.distributed.parallel_state as ps
+from mindspeed_rl.utils.loggers import Loggers
 
 from mindspeed_rl.workers.resharding.vllm_weight_container import MegatronStyleVllmWeightContainer
 from mindspeed_rl.workers.resharding.weight_adaptor import get_weight_adaptor
 from mindspeed_rl.utils.utils import mstx_timer_decorator
 
+logger = Loggers(
+    name="vllm_engine_inference",
+)
+
+
 
 class MegatronOffLoader:
     def __init__(self, megatron_model=None, optimizer=None, wrap_with_ddp=True):
         self.optimizer = optimizer
         self.model = megatron_model
         self.wrap_with_ddp = wrap_with_ddp
 
         self.tensor_to_cpu_states_map = dict()
 
     @mstx_timer_decorator
@@ -51,18 +58,29 @@ class MegatronOffLoader:
 
     @mstx_timer_decorator
     def offload_optimizer(self):
-        for param_group in self.optimizer.optimizer.param_groups:
-            for param in param_group['params']:
-                param.data = param.data.to("cpu", non_blocking=False)
-        self.optimizer.optimizer.state = self._move_to_device(self.optimizer.optimizer.state, "cpu")
+        if hasattr(self.optimizer, "chained_optimizers"):
+            optimizers = self.optimizer.chained_optimizers
+        else:
+            optimizers = [self.optimizer]
+        for optimizer in optimizers:
+            for param_group in optimizer.optimizer.param_groups:
+                for param in param_group['params']:
+                    param.data = param.data.to("cpu", non_blocking=False)
+            optimizer.optimizer.state = self._move_to_device(optimizer.optimizer.state,
+                                                             "cpu")
 
     @mstx_timer_decorator
     def onload_optimizer(self):
-        for param_group in self.optimizer.optimizer.param_groups:
-            for param in param_group['params']:
-                param.data = param.data.to(torch.cuda.current_device(), non_blocking=False)
-        self.optimizer.optimizer.state = self._move_to_device(self.optimizer.optimizer.state,
-                                                              torch.cuda.current_device())
+        if hasattr(self.optimizer, "chained_optimizers"):
+            optimizers = self.optimizer.chained_optimizers
+        else:
+            optimizers = [self.optimizer]
+        for optimizer in optimizers:
+            for param_group in optimizer.optimizer.param_groups:
+                for param in param_group['params']:
+                    param.data = param.data.to(torch.cuda.current_device(), non_blocking=False)
+            optimizer.optimizer.state = self._move_to_device(optimizer.optimizer.state,
+                                                             torch.cuda.current_device())
 
     @mstx_timer_decorator
     def _move_to_device(self, data, device):
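The offload/onload hunks above exist because Megatron can hand back a chained optimizer that wraps several per-chunk optimizers instead of a single one. A minimal sketch of the normalization step they both apply, with stand-in classes so it runs anywhere (iter_sub_optimizers, _Plain and _Chained are illustrative names, not part of the repository):

def iter_sub_optimizers(optimizer):
    # A chained optimizer exposes its members via .chained_optimizers;
    # a plain optimizer is treated as a one-element list, as in the hunk.
    if hasattr(optimizer, "chained_optimizers"):
        return list(optimizer.chained_optimizers)
    return [optimizer]

class _Plain:
    pass

class _Chained:
    chained_optimizers = [_Plain(), _Plain()]

assert len(iter_sub_optimizers(_Plain())) == 1
assert len(iter_sub_optimizers(_Chained())) == 2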
@ -133,7 +151,8 @@ class MegatronShardingManager:
|
|||||||
num_layer_list=None,
|
num_layer_list=None,
|
||||||
moe_tp_extend_ep=None,
|
moe_tp_extend_ep=None,
|
||||||
parallel_state=None,
|
parallel_state=None,
|
||||||
megatron_offloader=None
|
megatron_offloader=None,
|
||||||
|
noop_layers=None
|
||||||
):
|
):
|
||||||
"""Megatron Sharding Manager initialization.
|
"""Megatron Sharding Manager initialization.
|
||||||
|
|
||||||
@ -169,13 +188,13 @@ class MegatronShardingManager:
|
|||||||
moe_tp_extend_ep=moe_tp_extend_ep,
|
moe_tp_extend_ep=moe_tp_extend_ep,
|
||||||
parallel_state=parallel_state,
|
parallel_state=parallel_state,
|
||||||
weight_adaptor=self.weight_adaptor,
|
weight_adaptor=self.weight_adaptor,
|
||||||
enable_validate=enable_validate)
|
enable_validate=enable_validate,
|
||||||
|
noop_layers=noop_layers)
|
||||||
|
|
||||||
self.optimizer_offload = optimizer_offload
|
self.optimizer_offload = optimizer_offload
|
||||||
self.grad_offload = grad_offload
|
self.grad_offload = grad_offload
|
||||||
self.train_param_offload = train_param_offload
|
self.train_param_offload = train_param_offload
|
||||||
self.enable_validate = enable_validate
|
self.enable_validate = enable_validate
|
||||||
self.use_distributed_optimizer = self.optimizer.config.use_distributed_optimizer
|
|
||||||
self.inference_engine.offload_model_weights()
|
self.inference_engine.offload_model_weights()
|
||||||
self.megatron_offloader = megatron_offloader
|
self.megatron_offloader = megatron_offloader
|
||||||
|
|
||||||
@ -206,8 +225,6 @@ class MegatronShardingManager:
|
|||||||
3. do resharding
|
3. do resharding
|
||||||
4. offload training param
|
4. offload training param
|
||||||
"""
|
"""
|
||||||
if self.train_param_offload:
|
|
||||||
self.megatron_offloader.onload_param()
|
|
||||||
|
|
||||||
self.onload_infer_params()
|
self.onload_infer_params()
|
||||||
|
|
||||||
@ -215,9 +232,9 @@ class MegatronShardingManager:
|
|||||||
|
|
||||||
if self.train_param_offload:
|
if self.train_param_offload:
|
||||||
self.megatron_offloader.offload_param()
|
self.megatron_offloader.offload_param()
|
||||||
|
|
||||||
self.inference_engine.sync_model_weights(infer_params, load_format='megatron')
|
self.inference_engine.sync_model_weights(infer_params, load_format='megatron')
|
||||||
|
|
||||||
|
|
||||||
@mstx_timer_decorator
|
@mstx_timer_decorator
|
||||||
def exit_infer_mode(self):
|
def exit_infer_mode(self):
|
||||||
"""
|
"""
|
||||||
|
@ -71,7 +71,7 @@ class MemoryBuffer:
|
|||||||
if param_name not in self.tensor_indices:
|
if param_name not in self.tensor_indices:
|
||||||
raise KeyError(f"Parameter {param_name} not found in the buffer.")
|
raise KeyError(f"Parameter {param_name} not found in the buffer.")
|
||||||
|
|
||||||
start_index, shape = self.tensor_indices[param_name]
|
start_index, shape = self.tensor_indices[param_name] # weight_name -- index shape
|
||||||
return self.get(shape, start_index)
|
return self.get(shape, start_index)
|
||||||
|
|
||||||
|
|
||||||
@ -82,6 +82,15 @@ def calc_padded_numel(shape: torch.Size, dtype: torch.dtype):
|
|||||||
return (numel + align_numel - 1) // align_numel * align_numel
|
return (numel + align_numel - 1) // align_numel * align_numel
|
||||||
|
|
||||||
|
|
||||||
|
# 构建EP增大的buffer———构造一个experts_weight_buffer_meta
|
||||||
|
def get_weight_buffer_meta_from_buffer(weight_buffer_meta) -> Dict[str, Dict]:
|
||||||
|
experts_weight_buffer_meta = {}
|
||||||
|
for name, meta_info in sorted(weight_buffer_meta.items()):
|
||||||
|
if "mlp.experts" in name:
|
||||||
|
experts_weight_buffer_meta[name] = meta_info
|
||||||
|
return experts_weight_buffer_meta
|
||||||
|
|
||||||
|
|
||||||
def build_memory_buffer(weight_buffer_meta: Dict[str, Dict]) -> Dict[torch.dtype, MemoryBuffer]:
|
def build_memory_buffer(weight_buffer_meta: Dict[str, Dict]) -> Dict[torch.dtype, MemoryBuffer]:
|
||||||
"""Build the memory buffer given weight_buffer_meta
|
"""Build the memory buffer given weight_buffer_meta
|
||||||
|
|
||||||
@ -123,8 +132,61 @@ def build_memory_buffer(weight_buffer_meta: Dict[str, Dict]) -> Dict[torch.dtype
|
|||||||
return memory_buffers
|
return memory_buffers
|
||||||
|
|
||||||
|
|
||||||
|
def build_experts_memory_buffer(experts_weight_buffer_meta: Dict[str, Dict], experts_memory_expend_N) -> Dict[torch.dtype, MemoryBuffer]:
|
||||||
|
"""Build the experts memory buffer given experts_weight_buffer_meta
|
||||||
|
|
||||||
|
Args:
|
||||||
|
weight_buffer_meta: contains mapping from name to a dictionary containing shape and dtype of the tensors
|
||||||
|
|
||||||
|
Returns: a large memory buffer for each dtype that can hold all the tensors
|
||||||
|
|
||||||
|
"""
|
||||||
|
experts_memory_buffers = {}
|
||||||
|
total_numel_map = {} # map from dtype to the total numel
|
||||||
|
|
||||||
|
for _, meta_info in sorted(experts_weight_buffer_meta.items()):
|
||||||
|
shape = meta_info['shape']
|
||||||
|
shape = torch.Size([experts_memory_expend_N, shape[0], shape[1], shape[2]])
|
||||||
|
dtype = meta_info['dtype']
|
||||||
|
|
||||||
|
if not isinstance(shape, torch.Size):
|
||||||
|
raise TypeError("Shape must be an instance of torch.Size")
|
||||||
|
if not isinstance(dtype, torch.dtype):
|
||||||
|
raise TypeError("dtype must be an instance of torch.dtype")
|
||||||
|
if dtype not in total_numel_map:
|
||||||
|
total_numel_map[dtype] = 0
|
||||||
|
|
||||||
|
tmp_numel = calc_padded_numel(shape, dtype)
|
||||||
|
total_numel_map[dtype] += tmp_numel
|
||||||
|
|
||||||
|
|
||||||
|
for dtype, total_numel in total_numel_map.items():
|
||||||
|
# Create a buffer for each dtype with the total numel
|
||||||
|
experts_memory_buffers[dtype] = MemoryBuffer(total_numel, total_numel, dtype)
|
||||||
|
|
||||||
|
# Now, insert each tensor's index and shape for later retrieval by name
|
||||||
|
current_index_map = {} # This keeps track of the current memory index for each dtype
|
||||||
|
for name, meta_info in sorted(experts_weight_buffer_meta.items()):
|
||||||
|
shape = meta_info['shape']
|
||||||
|
shape = torch.Size([experts_memory_expend_N, shape[0], shape[1], shape[2]])
|
||||||
|
dtype = meta_info['dtype']
|
||||||
|
buffer = experts_memory_buffers[dtype]
|
||||||
|
tensor_size = calc_padded_numel(shape, dtype)
|
||||||
|
|
||||||
|
start_index = current_index_map.get(dtype, 0)
|
||||||
|
current_index_map[dtype] = start_index + tensor_size
|
||||||
|
|
||||||
|
buffer.tensor_indices[name] = (start_index, shape)
|
||||||
|
|
||||||
|
return experts_memory_buffers
|
||||||
|
|
||||||
|
|
||||||
def build_model_weight_buffer(model: nn.Module, names_per_pp: List[str], get_weight_buffer_meta):
|
def build_model_weight_buffer(model: nn.Module, names_per_pp: List[str], get_weight_buffer_meta):
|
||||||
memory_buffers = [ModelWeightBuffer(model, weight_names, get_weight_buffer_meta) for weight_names in names_per_pp]
|
combined_names_per_pp = [[] for _ in names_per_pp]
|
||||||
|
for pp_rank, vpp_stages in enumerate(names_per_pp):
|
||||||
|
for weight_names_per_stage in vpp_stages:
|
||||||
|
combined_names_per_pp[pp_rank].extend(weight_names_per_stage)
|
||||||
|
memory_buffers = [ModelWeightBuffer(model, weight_names, get_weight_buffer_meta) for weight_names in combined_names_per_pp]
|
||||||
return memory_buffers
|
return memory_buffers
|
||||||
|
|
||||||
|
|
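The new build_experts_memory_buffer above sizes one buffer per dtype by expanding every expert weight shape with a leading factor N (the ratio of inference EP to training EP) and padding the element count. A small self-contained sketch of that sizing arithmetic; padded_numel is a hypothetical stand-in for calc_padded_numel, and the alignment of 32 elements plus the tensor sizes are illustrative assumptions only:

import torch

def padded_numel(shape, align_numel=32):
    # Round the element count up to a multiple of align_numel
    # (the real helper derives the alignment from the dtype).
    numel = 1
    for dim in shape:
        numel *= dim
    return (numel + align_numel - 1) // align_numel * align_numel

# One grouped expert weight [num_local_experts, in, out], expanded by
# N = infer_ep_size // train_ep_size so a single buffer can hold the
# expert shards of N training ranks side by side.
base_shape = torch.Size([4, 1024, 512])
N = 2
expanded = torch.Size([N, *base_shape])
print(expanded, padded_numel(expanded))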
@@ -139,7 +201,7 @@ class ModelWeightBuffer:
         self.weight_buffer_meta = self.get_weight_buffer_meta(self.model, weight_names)
         self.weight_names = list(self.weight_buffer_meta.keys())
         self.memory_buffers = None
-        # self.memory_buffers = build_memory_buffer(self.weight_buffer_meta)
 
     def __getitem__(self, weight_name: str) -> torch.Tensor:
         return self.get_weight_by_name(weight_name)
@@ -25,12 +25,16 @@ import torch.distributed as dist
 import numpy as np
 
 from torch.distributed import new_group
+import vllm.distributed.parallel_state as ps
 
-from mindspeed_rl.workers.resharding.memory_buffer import build_model_weight_buffer
+from mindspeed_rl.workers.resharding.memory_buffer import build_model_weight_buffer, calc_padded_numel
 import mindspeed_rl.workers.resharding.utils
 from mindspeed_rl.workers.resharding.utils import get_tensor_parallel_partition_dim, tp_md5_validate, \
     update_md5_by_rank, compute_md5, validate_md5, _build_infer_param_dict, get_tp_allgather_group, \
     get_tp_allgather_world_size, is_tensor_parallel_param, get_tp_group, is_fake_tp_param
+from mindspeed_rl.utils.loggers import Loggers
+
+logger = Loggers(__name__)
 
 
 class MegatronStyleVllmWeightContainer:
@@ -42,7 +46,8 @@ class MegatronStyleVllmWeightContainer:
                  moe_tp_extend_ep=False,
                  parallel_state=None,
                  weight_adaptor=None,
-                 enable_validate=False) -> None:
+                 enable_validate=False,
+                 noop_layers=None) -> None:
         """ Megatron style vllm weight container.
 
         Arguments:
@@ -64,16 +69,26 @@ class MegatronStyleVllmWeightContainer:
         self.megatron_model = megatron_model
         self.parallel_state = parallel_state
         self.weight_adaptor = weight_adaptor
-        self._num_hidden_layers = self.model_config.num_hidden_layers
+        self._num_hidden_layers = self.model_config.num_hidden_layers  # the HF model read from config.json under the tokenizer path
+        self._noop_layers = None
+        if noop_layers is not None:
+            self._noop_layers = [int(layer_idx) for layer_idx in noop_layers.split(',')]
+            self._num_hidden_layers += len(self._noop_layers)
+
         # pp configs
         self._pp_rank = self.parallel_state.get_pipeline_model_parallel_rank()
         self._pp_group = self.parallel_state.get_pipeline_model_parallel_group()
         self._pp_size = self.parallel_state.get_pipeline_model_parallel_world_size()
+        self._world_size = dist.get_world_size()
+        self.pp_group_size = self._world_size // self._pp_size
+        ## vpp
         self._num_layer_list = self._build_num_layer_list(num_layer_list)
-        self._vpp_size = self.parallel_state._VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK if self.parallel_state._VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK else 1
-        self._vpp_rank = self.parallel_state._VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE if self.parallel_state._VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE else 0
+        self._vpp_rank = self.parallel_state._VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK if self.parallel_state._VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK else 0
+        self._vpp_size = self.parallel_state._VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE if self.parallel_state._VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE else 1
+        self._vpp_layer_list = self._build_vpp_layer_list(self._num_layer_list)
+        ## _noop_layers
+        self._global2local_map = self._build_global2local_map(self._vpp_layer_list, self._vpp_size, self._noop_layers) if self._noop_layers is not None else None
 
         # tp configs
         self._tp_size = self.parallel_state.get_tensor_model_parallel_world_size()
         self._tp_group = self.parallel_state.get_tensor_model_parallel_group()
@@ -97,7 +112,11 @@ class MegatronStyleVllmWeightContainer:
         self._infer_ep_size = infer_expert_parallel_size
         self.moe_tp_extend_ep = moe_tp_extend_ep
 
-        self._world_size = dist.get_world_size()
+        # TODO: infer_expert_tensor_parallel_size and num_process is fixed.
+        self.infer_expert_tensor_parallel_size = 1
+        self.num_process = 1
+        self._infer_ep_size = self._infer_ep_size * self._infer_tp_size
+        self.experts_memory_expend_N = self._infer_ep_size // self._ep_size
 
         # validate parallel configs
         self._validate_parallel_config()
@@ -116,10 +135,9 @@ class MegatronStyleVllmWeightContainer:
     def _validate_parallel_config(self):
         if self._infer_pp_size != 1:
             raise ValueError("infer_pp_size != 1 not supported yet")
-        if self._infer_ep_size != 1:
-            raise ValueError("infer_ep_size != 1 not supported yet")
-        if self._ep_size > 1 and self._ep_size != self._infer_tp_size:
-            raise ValueError("For training EP, supports EP -> TP only currently.")
+        if self._infer_ep_size % self._ep_size != 0:
+            raise ValueError("The training expert size should be divisibled by the inference expert size.")
         if self._ep_size > 1 and not self.moe_tp_extend_ep:
             raise ValueError("To enable training EP, you need to enable moe_tp_extend_ep and use GroupedMLP.")
         if self._pp_size < self._infer_pp_size:
@@ -149,6 +167,12 @@ class MegatronStyleVllmWeightContainer:
 
         self._update_weight_buffers_intra_pp()
         self._update_weight_buffers_inter_pp()
+
+        # precondition for running _update_weight_buffers_ep + _send_receive_experts
+        if(self.moe_tp_extend_ep and self._infer_ep_size >= self._ep_size):
+            self._update_weight_buffers_ep()
+            self._send_receive_experts()
+
         params = self._get_all_params()
 
         params = _build_infer_param_dict(params=params)
@@ -161,27 +185,55 @@ class MegatronStyleVllmWeightContainer:
             raise ValueError("num_layers % pp_size == 0, please specify num_layer_list")
         return [self._num_hidden_layers // self._pp_size for _ in range(self._pp_size)]
 
+    def _build_vpp_layer_list(self, num_layer_list):
+        if self._vpp_size <= 1:
+            return num_layer_list
+        for layers_in_pp_rank in num_layer_list:
+            if layers_in_pp_rank % self._vpp_size != 0:
+                raise ValueError("num_layers_per_pp % vpp_size != 0, please specify pp_size and vpp_size")
+        return [int(layers_in_pp_rank / self._vpp_size) for layers_in_pp_rank in num_layer_list]
+
+    def _build_global2local_map(self, layer_list, vpp_size, noop_layers):
+        stage_layers_num = sum(layer_list)
+        glb2local_map = []
+        for vpp_rank in range(vpp_size):
+            start_layer = vpp_rank * stage_layers_num
+            for _, layers_in_vpp_rank in enumerate(layer_list):
+                layer_idx_list = [
+                    layer_idx for layer_idx in range(start_layer, start_layer + layers_in_vpp_rank)
+                    if layer_idx not in noop_layers
+                ]
+                glb2local_map += [layer_idx % layers_in_vpp_rank for layer_idx in layer_idx_list]
+                start_layer += layers_in_vpp_rank
+
+        return glb2local_map
+
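To make the new _build_global2local_map concrete, the sketch below replays the same walk over global (training-side, noop-inclusive) layer ids and prints the resulting local indices; the configuration (pp=2, vpp=2, 8 training layers with layer 7 as a noop layer) is an illustrative assumption, not taken from the commit.

def build_global2local_map(layer_list, vpp_size, noop_layers):
    # Same logic as _build_global2local_map in the hunk above.
    stage_layers_num = sum(layer_list)
    glb2local_map = []
    for vpp_rank in range(vpp_size):
        start_layer = vpp_rank * stage_layers_num
        for layers_in_vpp_rank in layer_list:
            kept = [idx for idx in range(start_layer, start_layer + layers_in_vpp_rank)
                    if idx not in noop_layers]
            glb2local_map += [idx % layers_in_vpp_rank for idx in kept]
            start_layer += layers_in_vpp_rank
    return glb2local_map

# 8 training layers over pp=2 x vpp=2 (2 layers per model chunk), layer 7 is a noop:
# the 7 real layers map to local indices [0, 1, 0, 1, 0, 1, 0].
print(build_global2local_map([2, 2], 2, {7}))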
def _unwrap_megatron_model(self, model):
|
def _unwrap_megatron_model(self, model):
|
||||||
"""
|
"""
|
||||||
Remove consecutive 'module.' prefixes from the model based on the state_dict's first key.
|
Remove consecutive 'module.' prefixes from the model based on the state_dict's first key.
|
||||||
This method only removes 'module.' from the beginning of the key and ignores other occurrences.
|
This method only removes 'module.' from the beginning of the key and ignores other occurrences.
|
||||||
"""
|
"""
|
||||||
model = model[0]
|
unwraped_model = []
|
||||||
first_key = list(dict(model.named_parameters()).keys())[0]
|
for model_chunk in model:
|
||||||
while first_key.startswith("module."):
|
first_key = list(dict(model_chunk.named_parameters()).keys())[0]
|
||||||
model = model.module
|
while first_key.startswith("module."):
|
||||||
first_key = first_key[len("module."):] # 更新键,去掉一个module.
|
model_chunk = model_chunk.module
|
||||||
return model
|
first_key = first_key[len("module."):]
|
||||||
|
unwraped_model.append(model_chunk)
|
||||||
|
return unwraped_model
|
||||||
|
|
||||||
def _init_weight_buffers(self):
|
def _init_weight_buffers(self):
|
||||||
"""
|
"""
|
||||||
Build buffers from vllm state dict. Totally build train pp_size buffers, each buffer corresponds to a pack of megatron weight.
|
Build buffers from vllm state dict. Totally build train pp_size buffers, each buffer corresponds to a pack of megatron weight.
|
||||||
Return a list of buffers, and a reference dict megatron_param_name->buffer.
|
Return a list of buffers, and a reference dict megatron_param_name->buffer.
|
||||||
"""
|
"""
|
||||||
vllm_names = list(dict(self.vllm_model.named_parameters()).keys())
|
vllm_names = list(dict(self.vllm_model.named_parameters()).keys()) # 获取每个pp内部的weights name
|
||||||
self.weight_names_per_pp = self.weight_adaptor.get_weight_names_per_pp(self._num_layer_list, vllm_names)
|
self.weight_names_per_pp = self.weight_adaptor.get_weight_names_per_pp(self._vpp_layer_list, vllm_names,
|
||||||
|
sum(self._num_layer_list), self._vpp_size, self._noop_layers)
|
||||||
|
|
||||||
self.weight_buffers = build_model_weight_buffer(self.vllm_model, self.weight_names_per_pp,
|
self.weight_buffers = build_model_weight_buffer(self.vllm_model, self.weight_names_per_pp,
|
||||||
self.weight_adaptor.get_weight_buffer_meta)
|
self.weight_adaptor.get_weight_buffer_meta
|
||||||
|
)
|
||||||
|
|
||||||
def trans_ep_params_to_tp(self, megatron_param, name):
|
def trans_ep_params_to_tp(self, megatron_param, name):
|
||||||
"""
|
"""
|
||||||
@ -264,7 +316,11 @@ class MegatronStyleVllmWeightContainer:
|
|||||||
async_op=False
|
async_op=False
|
||||||
)
|
)
|
||||||
total_experts = self.num_local_experts * tp_size
|
total_experts = self.num_local_experts * tp_size
|
||||||
return torch.cat(output_tensor_list, dim=1).reshape(hidden_size, total_experts, -1).permute(1, 0, 2)
|
res = torch.cat(output_tensor_list, dim=1).reshape(hidden_size, total_experts, -1)
|
||||||
|
if 'weight2' in name:
|
||||||
|
return res.permute(1, 2, 0).contiguous()
|
||||||
|
return res.permute(1, 0, 2).contiguous()
|
||||||
|
|
||||||
|
|
||||||
def _update_weight_buffers_intra_pp(self):
|
def _update_weight_buffers_intra_pp(self):
|
||||||
"""
|
"""
|
||||||
@ -281,35 +337,107 @@ class MegatronStyleVllmWeightContainer:
|
|||||||
return infer_param
|
return infer_param
|
||||||
|
|
||||||
pp_rank = self._pp_rank
|
pp_rank = self._pp_rank
|
||||||
weight_buffer = self.weight_buffers[pp_rank]
|
weight_names = self.weight_names_per_pp[pp_rank]
|
||||||
|
weight_names_meta = self.weight_adaptor.convert_weight_name_meta(weight_names)
|
||||||
true_megatron_model = self._unwrap_megatron_model(self.megatron_model)
|
true_megatron_model = self._unwrap_megatron_model(self.megatron_model)
|
||||||
normal_layer_func = partial(self.weight_adaptor.global2local_layer, num_layer_list=self._num_layer_list)
|
normal_layer_func = partial(self.weight_adaptor.global2local_layer, num_layer_list=self._vpp_layer_list, global2local_map=self._global2local_map)
|
||||||
name_pairs = sorted(list(set([(name, self.weight_adaptor.replace_name_i2t(normal_layer_func(name)))
|
name_pairs = sorted(list(set([(name, vpp_rank, self.weight_adaptor.replace_name_i2t(normal_layer_func(name, vpp_rank=vpp_rank)))
|
||||||
for name in weight_buffer.weight_names])))
|
for vpp_rank, names_per_vpp in enumerate(weight_names_meta) for name in names_per_vpp])))
|
||||||
|
|
||||||
if self.enable_validate:
|
if self.enable_validate:
|
||||||
self.origin_params_for_md5 = hashlib.md5()
|
self.origin_params_for_md5 = hashlib.md5()
|
||||||
self.infer_params_for_md5 = [hashlib.md5() for _ in range(get_tp_allgather_world_size())]
|
self.infer_params_for_md5 = [hashlib.md5() for _ in range(get_tp_allgather_world_size())]
|
||||||
for hf_name, megatron_name in name_pairs:
|
|
||||||
|
# 检查 linear_fc1 和 linear_fc2 权重形状是否符合特定关系(fc1 包含门控和扩展参数,因此大小是 fc2 的两倍)。不符合条件的模型不被支持。
|
||||||
|
for _, vpp_rank, megatron_name in name_pairs:
|
||||||
if megatron_name.endswith("linear_fc1.weight"):
|
if megatron_name.endswith("linear_fc1.weight"):
|
||||||
fc2_name = megatron_name.replace("linear_fc1", "linear_fc2")
|
fc2_name = megatron_name.replace("linear_fc1", "linear_fc2")
|
||||||
megatron_param_fc1 = dict(true_megatron_model.named_parameters())[megatron_name]
|
megatron_param_fc1 = dict(true_megatron_model[vpp_rank].named_parameters())[megatron_name]
|
||||||
megatron_param_fc2 = dict(true_megatron_model.named_parameters())[fc2_name]
|
megatron_param_fc2 = dict(true_megatron_model[vpp_rank].named_parameters())[fc2_name]
|
||||||
if megatron_param_fc1.shape[0] * megatron_param_fc1.shape[1] != megatron_param_fc2.shape[0] * \
|
if megatron_param_fc1.shape[0] * megatron_param_fc1.shape[1] != megatron_param_fc2.shape[0] * \
|
||||||
megatron_param_fc2.shape[1] * 2:
|
megatron_param_fc2.shape[1] * 2:
|
||||||
raise ValueError("Only implemented for Llama model which linear_fc1 contains gate and up params.")
|
raise ValueError("Only implemented for Llama model which linear_fc1 contains gate and up params.")
|
||||||
|
|
||||||
megatron_params_dict = dict(true_megatron_model.named_buffers())
|
weight_buffer = self.weight_buffers[pp_rank]
|
||||||
megatron_params_dict.update(true_megatron_model.named_parameters())
|
megatron_params_dict = {}
|
||||||
for hf_name, megatron_name in name_pairs:
|
for vpp_rank in range(self._vpp_size):
|
||||||
megatron_param = megatron_params_dict[megatron_name]
|
megatron_params_dict.update({vpp_rank: dict(true_megatron_model[vpp_rank].named_buffers())})
|
||||||
param = _transfer_from_megatron_division(megatron_param, megatron_name)
|
megatron_params_dict[vpp_rank].update(true_megatron_model[vpp_rank].named_parameters())
|
||||||
weight_buffer.copy_by_name(hf_name, param)
|
|
||||||
|
for hf_name, vpp_rank, megatron_name in name_pairs:
|
||||||
|
if((self._infer_ep_size > 1 or self._ep_size > 1) and "mlp.experts" in megatron_name):
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
megatron_param = megatron_params_dict[vpp_rank][megatron_name]
|
||||||
|
param = _transfer_from_megatron_division(megatron_param, megatron_name)
|
||||||
|
weight_buffer.copy_by_name(hf_name, param)
|
||||||
|
|
||||||
# tp md5 validate
|
# tp md5 validate
|
||||||
if self.enable_validate:
|
if self.enable_validate:
|
||||||
tp_md5_validate(self.infer_params_for_md5, self.origin_params_for_md5,
|
tp_md5_validate(self.infer_params_for_md5, self.origin_params_for_md5,
|
||||||
f"rank[{self._rank}] tp params allgather")
|
f"rank[{self._rank}] tp params allgather")
|
||||||
|
|
||||||
|
def _update_weight_buffers_ep(self):
|
||||||
|
# 构造临时的experts_memory_buffers
|
||||||
|
for cur_pp_rank in range(self._pp_size):
|
||||||
|
pp_rank = self._pp_rank
|
||||||
|
from mindspeed_rl.workers.resharding.memory_buffer import build_experts_memory_buffer, get_weight_buffer_meta_from_buffer
|
||||||
|
# Step1 在当前的PP_rank中,设置一个临时的exprts_buffer
|
||||||
|
combined_names_per_pp = []
|
||||||
|
vpp_stages = self.weight_names_per_pp[cur_pp_rank]
|
||||||
|
for weight_names_per_stage in vpp_stages:
|
||||||
|
combined_names_per_pp.extend(weight_names_per_stage)
|
||||||
|
self.weight_buffer_meta = self.weight_adaptor.get_weight_buffer_meta(self.vllm_model, combined_names_per_pp)
|
||||||
|
self.experts_weight_buffer_meta = get_weight_buffer_meta_from_buffer(self.weight_buffer_meta)
|
||||||
|
self.experts_memory_buffers = build_experts_memory_buffer(self.experts_weight_buffer_meta, self.experts_memory_expend_N)
|
||||||
|
|
||||||
|
# Step2 将weights_buffer上对应的权重放到experts_buffer中
|
||||||
|
if(cur_pp_rank == pp_rank):
|
||||||
|
weight_names = self.weight_names_per_pp[pp_rank]
|
||||||
|
weight_names_meta = self.weight_adaptor.convert_weight_name_meta(weight_names)
|
||||||
|
normal_layer_func = partial(self.weight_adaptor.global2local_layer, num_layer_list=self._vpp_layer_list, global2local_map=self._global2local_map)
|
||||||
|
name_pairs = sorted(list(set([(name, vpp_rank, self.weight_adaptor.replace_name_i2t(normal_layer_func(name, vpp_rank=vpp_rank)))
|
||||||
|
for vpp_rank, names_per_vpp in enumerate(weight_names_meta) for name in names_per_vpp])))
|
||||||
|
true_megatron_model = self._unwrap_megatron_model(self.megatron_model)
|
||||||
|
|
||||||
|
megatron_params_dict = {}
|
||||||
|
# 拿到当前pp的所有权重
|
||||||
|
for vpp_rank in range(self._vpp_size):
|
||||||
|
megatron_params_dict.update({vpp_rank: dict(true_megatron_model[vpp_rank].named_buffers())})
|
||||||
|
megatron_params_dict[vpp_rank].update(true_megatron_model[vpp_rank].named_parameters())
|
||||||
|
|
||||||
|
for hf_name, vpp_rank, megatron_name in name_pairs:
|
||||||
|
if((self._infer_ep_size > 1 or self._ep_size > 1) and "mlp.experts" in megatron_name):
|
||||||
|
megatron_param = megatron_params_dict[vpp_rank][megatron_name]
|
||||||
|
dtype = self.experts_weight_buffer_meta[hf_name]['dtype']
|
||||||
|
self.experts_memory_buffers[dtype].copy_by_name(hf_name, megatron_param)
|
||||||
|
|
||||||
|
# Step3 后续的操作可以复用
|
||||||
|
global_src = dist.get_global_rank(group=self._pp_group, group_rank=cur_pp_rank)
|
||||||
|
|
||||||
|
# broadcast专家权重(experts memory buffer中的)
|
||||||
|
for dtype, experts_memory_buffer in self.experts_memory_buffers.items():
|
||||||
|
dist.broadcast(tensor=experts_memory_buffer.data, src=global_src, group=self._pp_group, async_op=False)
|
||||||
|
pp_group_rank = self._rank // self.pp_group_size
|
||||||
|
|
||||||
|
# 获取对应的dtype
|
||||||
|
for name, tensor_indices_value in sorted(experts_memory_buffer.tensor_indices.items()):
|
||||||
|
shape = tensor_indices_value[1] # 是*N的
|
||||||
|
index = pp_group_rank % self.experts_memory_expend_N
|
||||||
|
experts_tensor = experts_memory_buffer.get_by_name(name)
|
||||||
|
experts_tensor_reshape = experts_tensor.view(shape)
|
||||||
|
weight_tensor_infer = experts_tensor_reshape[index]
|
||||||
|
self.weight_buffers[cur_pp_rank].copy_by_name(name, weight_tensor_infer)
|
||||||
|
|
||||||
|
# 卸载专家的buffer
|
||||||
|
experts_memory_buffer = None
|
||||||
|
self.experts_memory_buffers[dtype] = None
|
||||||
|
|
||||||
|
for memory_buffer in self.experts_memory_buffers.values():
|
||||||
|
memory_buffer = None
|
||||||
|
self.experts_memory_buffers = None
|
||||||
|
|
||||||
|
|
||||||
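In _update_weight_buffers_ep above, every rank receives all N = infer_ep // train_ep expert shards stacked along the leading dimension and then keeps exactly one slice, selected by pp_group_rank % N. The toy sketch below shows only that slice-selection rule with made-up sizes; the real code first views a flat buffer with .view(shape), which is omitted here.

import torch

N = 2
# N per-rank expert shards stacked on dim 0 (illustrative shape [N, 4, 3]).
stacked = torch.arange(2 * 4 * 3).reshape(N, 4, 3)

for pp_group_rank in range(4):
    index = pp_group_rank % N      # same selection rule as the hunk above
    shard = stacked[index]         # the slice this rank keeps
    print(pp_group_rank, "->", index, tuple(shard.shape))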
def _update_weight_buffers_inter_pp(self):
|
def _update_weight_buffers_inter_pp(self):
|
||||||
"""
|
"""
|
||||||
Update weight buffers by gathering weights from other pp stage.
|
Update weight buffers by gathering weights from other pp stage.
|
||||||
@ -328,6 +456,36 @@ class MegatronStyleVllmWeightContainer:
|
|||||||
dist.broadcast(md5_tensor_src, group=self._pp_group, src=global_src, async_op=False)
|
dist.broadcast(md5_tensor_src, group=self._pp_group, src=global_src, async_op=False)
|
||||||
validate_md5(md5_tensor_src, md5_tensor, f"rank[{self._rank}] pp resharding params")
|
validate_md5(md5_tensor_src, md5_tensor, f"rank[{self._rank}] pp resharding params")
|
||||||
|
|
||||||
|
|
||||||
|
def get_expert_router(self, cur_rank, train_tp_ep_size, infer_tp_ep_size, world_size):
|
||||||
|
for tp_ep_group_id in range(world_size // infer_tp_ep_size):
|
||||||
|
tp_ep_group = [i for i in range(tp_ep_group_id * infer_tp_ep_size, (tp_ep_group_id + 1) * infer_tp_ep_size)]
|
||||||
|
if cur_rank in tp_ep_group:
|
||||||
|
self.INFER_TP_EP_GROUP = tp_ep_group
|
||||||
|
stride = infer_tp_ep_size // train_tp_ep_size
|
||||||
|
dev_array = np.array(self.INFER_TP_EP_GROUP).reshape(stride, train_tp_ep_size)
|
||||||
|
src_router = np.squeeze(dev_array.transpose().reshape(1, infer_tp_ep_size)).tolist()
|
||||||
|
src = src_router[cur_rank % infer_tp_ep_size]
|
||||||
|
dst = self.INFER_TP_EP_GROUP[src_router.index(cur_rank)]
|
||||||
|
return src, dst
|
||||||
|
|
||||||
|
def _send_receive_experts(self):
|
||||||
|
cur_rank = dist.get_rank()
|
||||||
|
src_rank, dst_rank = self.get_expert_router(cur_rank, self._ep_size, self._infer_ep_size, self._world_size)
|
||||||
|
for cur_pp_rank in range(self._pp_size):
|
||||||
|
for memory_buffer in self.weight_buffers[cur_pp_rank].memory_buffers.values():
|
||||||
|
for name in sorted(memory_buffer.tensor_indices.keys()):
|
||||||
|
if "mlp.experts" in name:
|
||||||
|
# 做收发
|
||||||
|
tensor_to_send = memory_buffer.get_by_name(name)
|
||||||
|
tensor_to_replace = torch.empty_like(tensor_to_send)
|
||||||
|
send_op = dist.P2POp(dist.isend, tensor_to_send, dst_rank)
|
||||||
|
recv_op = dist.P2POp(dist.irecv, tensor_to_replace, src_rank)
|
||||||
|
reqs = dist.batch_isend_irecv([send_op, recv_op])
|
||||||
|
for req in reqs:
|
||||||
|
req.wait()
|
||||||
|
memory_buffer.copy_by_name(name, tensor_to_replace)
|
||||||
|
|
||||||
def _get_all_params(self):
|
def _get_all_params(self):
|
||||||
"""Get all the parameters of the models in all pp ranks
|
"""Get all the parameters of the models in all pp ranks
|
||||||
|
|
||||||
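The routing arithmetic in get_expert_router is easiest to see on a small example. The standalone sketch below reuses the same reshape/transpose construction outside the class; the configuration (world_size=8, training TPxEP group of 2, inference TPxEP group of 4) is an illustrative assumption. With these numbers ranks 5 and 6 swap their expert shards while ranks 4 and 7 keep their own (src == dst == rank), which is exactly the pairing the P2P batch in _send_receive_experts performs.

import numpy as np

def expert_router(cur_rank, train_tp_ep_size, infer_tp_ep_size, world_size):
    # Standalone version of get_expert_router from the hunk above.
    for gid in range(world_size // infer_tp_ep_size):
        group = list(range(gid * infer_tp_ep_size, (gid + 1) * infer_tp_ep_size))
        if cur_rank in group:
            tp_ep_group = group
    stride = infer_tp_ep_size // train_tp_ep_size
    dev_array = np.array(tp_ep_group).reshape(stride, train_tp_ep_size)
    src_router = np.squeeze(dev_array.transpose().reshape(1, infer_tp_ep_size)).tolist()
    src = src_router[cur_rank % infer_tp_ep_size]
    dst = tp_ep_group[src_router.index(cur_rank)]
    return src, dst

for rank in range(4, 8):
    print(rank, expert_router(rank, 2, 4, 8))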
@ -353,7 +511,7 @@ class MegatronStyleVllmWeightContainer:
|
|||||||
return
|
return
|
||||||
if self._tp_size % self._infer_tp_size != 0:
|
if self._tp_size % self._infer_tp_size != 0:
|
||||||
raise ValueError("self._tp_size must be divisible by self._infer_tp_size")
|
raise ValueError("self._tp_size must be divisible by self._infer_tp_size")
|
||||||
tp_allgather_size = self._tp_size // self._infer_tp_size
|
tp_allgather_size = self._tp_size
|
||||||
if mindspeed_rl.workers.resharding.utils._TP_ALLGATHER_GROUP is not None:
|
if mindspeed_rl.workers.resharding.utils._TP_ALLGATHER_GROUP is not None:
|
||||||
raise RuntimeError("Group for allgather tensor model parallel weight is already initialized")
|
raise RuntimeError("Group for allgather tensor model parallel weight is already initialized")
|
||||||
num_groups = self._world_size // tp_allgather_size
|
num_groups = self._world_size // tp_allgather_size
|
||||||
@ -432,7 +590,7 @@ class MegatronStyleVllmWeightContainer:
|
|||||||
2. split train_tp params into groups (size: infer_tp_size)
|
2. split train_tp params into groups (size: infer_tp_size)
|
||||||
3. return the corresponding param from group based on infer tp rank
|
3. return the corresponding param from group based on infer tp rank
|
||||||
"""
|
"""
|
||||||
if self._infer_tp_size <= self._tp_size:
|
if self._infer_tp_size <= self._tp_size or is_fake_tp_param(name, self.moe_tp_extend_ep):
|
||||||
return param
|
return param
|
||||||
|
|
||||||
tp_group = get_tp_group()
|
tp_group = get_tp_group()
|
||||||
@ -494,6 +652,9 @@ class MegatronStyleVllmWeightContainer:
|
|||||||
torch.distributed.all_gather(infer_param, param, group=tp_allgather_group)
|
torch.distributed.all_gather(infer_param, param, group=tp_allgather_group)
|
||||||
if self.enable_validate:
|
if self.enable_validate:
|
||||||
update_md5_by_rank(infer_param, param, self.origin_params_for_md5, self.infer_params_for_md5)
|
update_md5_by_rank(infer_param, param, self.origin_params_for_md5, self.infer_params_for_md5)
|
||||||
infer_param = self._default_tp_concat_fn(name, param, infer_param)
|
part_len = len(infer_param) // self._infer_tp_size
|
||||||
|
start = self._rank % self._infer_tp_size
|
||||||
|
part_param = infer_param[part_len * start:part_len * (start + 1)]
|
||||||
|
infer_param = self._default_tp_concat_fn(name, param, part_param)
|
||||||
|
|
||||||
return infer_param
|
return infer_param
|
||||||
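The last hunk changes the TP resharding so that the allgather now runs over the full training TP group and each inference TP rank keeps only its contiguous block of the gathered shards before concatenation. A small sketch of that slicing with made-up sizes (train TP 4, infer TP 2); the concat dimension is only illustrative, since the real _default_tp_concat_fn picks the partition dim per parameter.

import torch

train_tp, infer_tp = 4, 2
# Gathered per-train-TP shards of one weight (illustrative 2x3 shards).
shards = [torch.full((2, 3), fill_value=r) for r in range(train_tp)]

rank = 1                                # this process's rank, for illustration
part_len = len(shards) // infer_tp      # shards kept per inference TP rank
start = rank % infer_tp
part = shards[part_len * start:part_len * (start + 1)]
merged = torch.cat(part, dim=0)         # concat along the partition dim
print([int(s[0, 0]) for s in part], tuple(merged.shape))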
|
@ -42,6 +42,7 @@ class MegatronVLLMWeightAdaptor(BaseWeightAdaptor):
    def __init__(self, model_config):
        super(MegatronVLLMWeightAdaptor, self).__init__()
        self.model_config = model_config
+       self.meta_info = None
        self.params_mapping = [
            # (megatron core gpt model name, vllm model name)
            ("embedding.word_embeddings", "model.embed_tokens"),
@ -92,6 +93,8 @@ class MegatronVLLMWeightAdaptor(BaseWeightAdaptor):
        """
        pass

+   def convert_weight_name_meta(self, weight_names):
+       return weight_names

    def get_weight_buffer_meta(self, model, valid_names=None):
        weight_buffer_meta = {}
@ -103,11 +106,12 @@ class MegatronVLLMWeightAdaptor(BaseWeightAdaptor):
        return weight_buffer_meta

    @staticmethod
-   def global2local_layer(name, num_layer_list):
+   def global2local_layer(name, num_layer_list, vpp_rank=0, global2local_map=None):
        """
        Transform the model name in each model_chunk in global space to local space
        """
        layer_name = 'layers'
+       num_layer_offset = vpp_rank * sum(num_layer_list)

        if layer_name in name:  # belong to an intermediate layer
            split_name = name.split('.')
@ -122,12 +126,15 @@ class MegatronVLLMWeightAdaptor(BaseWeightAdaptor):
                raise ValueError(f'split_name = {split_name}')

            # increment layer_num_idx by layer_offset
-           global_idx = int(split_name[layer_num_idx])
-           for layers_in_pp in num_layer_list:
-               global_idx -= layers_in_pp
-               if global_idx < 0:
-                   local_index = global_idx + layers_in_pp
-                   break
+           if global2local_map is None:
+               global_idx = int(split_name[layer_num_idx]) - num_layer_offset
+               for layers_in_pp in num_layer_list:
+                   global_idx -= layers_in_pp
+                   if global_idx < 0:
+                       local_index = global_idx + layers_in_pp
+                       break
+           else:
+               local_index = global2local_map[int(split_name[layer_num_idx])]

            split_name[layer_num_idx] = str(local_index)
            name = '.'.join(split_name)  # weight name in inference_tp_model
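A worked example of the global-to-local mapping above, with an assumed layout of two pipeline stages, [2, 2] layers per virtual stage, and vpp_rank = 1:

    num_layer_list = [2, 2]
    vpp_rank = 1
    global_layer = 6                                            # e.g. "decoder.layers.6..."
    global_idx = global_layer - vpp_rank * sum(num_layer_list)  # 6 - 4 = 2
    for layers_in_pp in num_layer_list:
        global_idx -= layers_in_pp
        if global_idx < 0:
            local_index = global_idx + layers_in_pp
            break
    print(local_index)  # 0: global layer 6 is the first local layer of its model chunk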
@ -135,15 +142,27 @@ class MegatronVLLMWeightAdaptor(BaseWeightAdaptor):
        return name

    @staticmethod
-   def get_weight_names_per_pp(layer_list, vllm_names):
+   def get_weight_names_per_pp(layer_list, vllm_names, layers_num=None, vpp_size=0, noop_layers=None):
+       # add protection for default kwargs
+       if not layers_num:
+           if vpp_size > 0:
+               raise ValueError(f"layers_num is required with vpp_size = {vpp_size}")
+           layers_num = sum(layer_list)

-       end_layer = sum(layer_list) - 1
+       end_layer = layers_num - 1

-       def get_weight_names_in_range(layer_range, names: list, layer_name='layers') -> list:
+       def get_weight_names_in_range(layer_range, names: list, noop_layers=None, layer_name='layers') -> list:
            """
            Extract weights in a given range and also include the weights before and after the range as needed.
            """
            start, end = layer_range

+           layer_idx_list = [layer_idx for layer_idx in range(start, end + 1)]
+           if noop_layers:
+               layer_idx_list = [
+                   layer_idx - sum(1 for i in noop_layers if i <= layer_idx) for layer_idx in layer_idx_list if
+                   layer_idx not in noop_layers
+               ]
            last_layer_index = end_layer
            names_in_range = []

@ -160,7 +179,7 @@ class MegatronVLLMWeightAdaptor(BaseWeightAdaptor):
                match = re.match(r'.*\.layers\.(\d+)', name)
                if match:
                    layer_num = int(match.group(1))
-                   if start <= layer_num <= end:
+                   if layer_num in layer_idx_list:
                        names_in_range.append(name)

            # add names after decode layers
@ -172,13 +191,18 @@ class MegatronVLLMWeightAdaptor(BaseWeightAdaptor):
                    break
            return names_in_range

-       pp_layers_range = []
-       start_layer = 0
-       for layers_in_pp_rank in layer_list:
-           pp_layers_range.append((start_layer, start_layer + layers_in_pp_rank - 1))
-           start_layer += layers_in_pp_rank
-       weight_names_per_pp = [get_weight_names_in_range(layer_range, vllm_names) for layer_range in pp_layers_range]
-       return weight_names_per_pp
+       stage_layers_num = sum(layer_list)
+       weight_names_per_vpp_combined = [[] for _ in layer_list]
+       for vpp_rank in range(vpp_size):
+           start_layer = vpp_rank * stage_layers_num
+           for pp_rank, layers_in_vpp_rank in enumerate(layer_list):
+               vpp_layers_range = (start_layer, start_layer + layers_in_vpp_rank - 1)
+               weight_names_per_vpp = get_weight_names_in_range(vpp_layers_range, vllm_names, noop_layers)
+               weight_names_per_vpp_combined[pp_rank].append(weight_names_per_vpp)
+
+               start_layer += layers_in_vpp_rank
+
+       return weight_names_per_vpp_combined


class DeepSeekMVWeightAdaptor(MegatronVLLMWeightAdaptor):
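A worked example of the noop-layer remapping used in `get_weight_names_in_range` above (values are illustrative): layers marked as no-op are dropped, and later indices shift down by the number of preceding no-op layers.

    start, end = 4, 7
    noop_layers = {5}
    layer_idx_list = list(range(start, end + 1))          # [4, 5, 6, 7]
    layer_idx_list = [
        idx - sum(1 for i in noop_layers if i <= idx)
        for idx in layer_idx_list if idx not in noop_layers
    ]
    print(layer_idx_list)                                 # [4, 5, 6]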
@ -187,6 +211,8 @@ class DeepSeekMVWeightAdaptor(MegatronVLLMWeightAdaptor):
    """
    def __init__(self, model_config):
        super(DeepSeekMVWeightAdaptor, self).__init__(model_config)
+       self.meta_info = {'replace': {'kv_a_proj_with_mqa': 'qkv_proj'},
+                         'delete': ['q_a_proj']}
        self.params_mapping = [
            # (megatron core gpt model name, vllm model name)
            ("embedding.word_embeddings", "model.embed_tokens"),
@ -216,16 +242,48 @@ class DeepSeekMVWeightAdaptor(MegatronVLLMWeightAdaptor):
            if valid_names and name not in valid_names:
                continue
            if 'kv_a_proj_with_mqa' in name:
-               q_param = dict(model.named_parameters()).get(name.replace('kv_a_proj_with_mqa', 'q_a_proj'))
+               # concatenate the kv_a_proj_with_mqa and q_a_proj tensors, and replace the original
+               # kv_a_proj_with_mqa entry with a qkv_proj entry holding the concatenated result
+               q_param = dict(model.named_parameters()).get(name.replace('kv_a_proj_with_mqa', 'q_a_proj' if self.model_config.q_lora_rank else "q_proj"))
                qkv_param_shape = torch.cat([q_param, param], dim=0).shape
                qkv_name = name.replace('kv_a_proj_with_mqa', 'qkv_proj')
                weight_buffer_meta[qkv_name] = {'shape': qkv_param_shape, 'dtype': param.dtype}
-           elif 'q_a_proj' in name:
+           elif 'q_a_proj' in name or 'q_proj' in name:
                continue
            else:
                weight_buffer_meta[name] = {'shape': param.shape, 'dtype': param.dtype}
        return weight_buffer_meta

+   def convert_weight_name_meta(self, weight_names):
+       if not self.meta_info:
+           return weight_names
+
+       weight_names_meta = list()
+       for elements in weight_names:
+           if isinstance(elements, list):
+               tmp_weight_names_meta = self.convert_weight_name_meta(elements)
+               weight_names_meta.append(tmp_weight_names_meta)
+           else:
+               converted = False
+               if not converted and 'replace' in self.meta_info:
+                   for key, value in self.meta_info['replace'].items():
+                       if key in elements:
+                           qkv_name = elements.replace(key, value)
+                           weight_names_meta.append(qkv_name)
+                           converted = True
+                           break
+
+               if not converted and 'delete' in self.meta_info:
+                   for key in self.meta_info['delete']:
+                       if key in elements:
+                           converted = True
+                           break
+
+               if not converted:
+                   weight_names_meta.append(elements)
+
+       return weight_names_meta
+
+
class QwenMVWeightAdaptor(MegatronVLLMWeightAdaptor):
    """
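A condensed sketch of the buffer-meta merge performed by `get_weight_buffer_meta` above, assuming `params` is a plain name-to-tensor dict such as `dict(model.named_parameters())` and that the q projection is named `q_a_proj`:

    import torch

    def merged_qkv_meta(params):
        meta = {}
        for name, param in params.items():
            if 'kv_a_proj_with_mqa' in name:
                # concatenate q and kv projections and register them under qkv_proj
                q_param = params[name.replace('kv_a_proj_with_mqa', 'q_a_proj')]
                qkv_shape = torch.cat([q_param, param], dim=0).shape
                meta[name.replace('kv_a_proj_with_mqa', 'qkv_proj')] = {
                    'shape': qkv_shape, 'dtype': param.dtype}
            elif 'q_a_proj' in name:
                continue  # already folded into qkv_proj
            else:
                meta[name] = {'shape': param.shape, 'dtype': param.dtype}
        return meta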
@ -239,6 +297,7 @@ WEIGHT_ADAPTOR_REGISTRY = {
    "Qwen2ForCausalLM": QwenMVWeightAdaptor,
    "DeepseekV3ForCausalLM": DeepSeekMVWeightAdaptor,
    "DeepseekV2ForCausalLM": DeepSeekMVWeightAdaptor,
+   "CustomDeepseekV3ForCausalLM": DeepSeekMVWeightAdaptor,
}
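Hedged usage sketch: the registry above is keyed by the HuggingFace architecture name, so a caller can be expected to resolve an adaptor roughly like this (`model_config` is assumed to come from the caller):

    arch = "CustomDeepseekV3ForCausalLM"          # e.g. model_config.architectures[0]
    adaptor_cls = WEIGHT_ADAPTOR_REGISTRY[arch]
    adaptor = adaptor_cls(model_config)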
@ -85,6 +85,7 @@ rl_config:
  num_npus: 8

generate_config:
+ enforce_eager: True
  gen_micro_batch_size: 16
  trust_remote_code: true
  offload_train_optimizer: true
@ -1,24 +0,0 @@
-#!/bin/bash
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-# get the absolute path of this script
-SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd)
-export PYTHONPATH=$SCRIPT_DIR/../../..:$PYTHONPATH
-
-GPUS_PER_NODE=1
-MASTER_ADDR=localhost
-MASTER_PORT=6555
-NNODES=1
-NODE_RANK=0
-
-DISTRIBUTED_ARGS="
-    --nproc_per_node $GPUS_PER_NODE \
-    --nnodes $NNODES \
-    --node_rank $NODE_RANK \
-    --master_addr $MASTER_ADDR \
-    --master_port $MASTER_PORT
-"
-echo "start test_vllm_engine st"
-
-torchrun $DISTRIBUTED_ARGS $SCRIPT_DIR/test_vllm_engine.py --distribute-backend nccl
-torchrun $DISTRIBUTED_ARGS $SCRIPT_DIR/test_vllm_engine_multistep_decode.py --distribute-backend nccl
@ -1,140 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-# Copyright (c) Huawei Technologies Co., Ltd.2023-2025. All rights reserved.
-import os
-import logging
-
-import tensordict
-import torch
-from torch_npu.contrib import transfer_to_npu
-
-from mindspeed_rl.models.rollout.vllm_engine import VLLMInferEngine
-from mindspeed_rl.config_cls.megatron_config import MegatronConfig
-from mindspeed_rl.utils.loggers import Loggers
-
-tokenizer_name_or_path = "/data/for_dt/tokenizer/Llama-3.2-1B-Instruct/"
-weights_path = "/data/for_dt/weights/Llama-3.2-1B-tp1pp1/iter_0000001/mp_rank_00/model_optim_rng.pt"
-megatron_dict = {"num_attention_heads": 32,
-                 "tensor_model_parallel_size": 1,
-                 "num_query_groups": 8,
-                 "group_query_attention": True}
-sampling_config = {
-    "num_completions": 1,  # number of independent completions per input prompt
-    "logprobs": 1,  # number of top-token log probabilities to return
-    "max_tokens": 128,  # maximum number of generated tokens
-    "best_of": 2,  # number of candidate completions generated internally, from which the best is selected
-    "top_p": 1.0,  # cumulative probability threshold for nucleus sampling
-    "top_k": 50,  # number of highest-probability tokens considered during sampling
-    "min_p": 0.0,  # minimum probability threshold for token selection
-    "temperature": 0.2,  # temperature controlling randomness of predictions
-    "detokenize": False  # whether to convert generated tokens back into readable strings
-}
-
-
-def main():
-    logger = Loggers(
-        name="test_vllm_engine",
-    )
-    logger.info("start test_vllm_engine")
-
-    conversation = [
-        {
-            "role": "system",
-            "content": "You are a helpful assistant"
-        },
-        {
-            "role": "user",
-            "content": "Hello"
-        },
-        {
-            "role": "assistant",
-            "content": "Hello! How can I assist you today?"
-        },
-        {
-            "role": "user",
-            "content": "Write an essay about the importance of higher education.",
-        },
-    ]
-
-    logger.info("load megatron weight")
-    megatron_st = torch.load(weights_path)
-    actor_weights = megatron_st['model']
-
-    # parameters required to initialize the engine
-
-    train_tensor_parallel_size = 1
-    train_pipeline_parallel_size = 1
-    infer_tensor_parallel_size = 1
-    infer_pipeline_parallel_size = 1
-    train_expert_parallel_size = 1
-    infer_expert_parallel_size = 1
-    max_num_seqs = 256
-    trust_remote_code = True
-
-    logger.info("enter vllmInferEngine")
-
-    megatron_config = MegatronConfig(megatron_dict, {})
-    megatron_config.num_attention_heads = 32
-    megatron_config.tensor_model_parallel_size = 1
-    megatron_config.num_query_groups = 8
-    megatron_config.num_key_value_heads = 8
-    megatron_config.group_query_attention = True
-    # initialize the VLLMInferEngine instance
-    inference_engine = VLLMInferEngine(
-        megatron_config=megatron_config,
-        sampling_config=sampling_config,
-        train_expert_parallel_size=train_expert_parallel_size,
-        infer_expert_parallel_size=infer_expert_parallel_size,
-        tokenizer_name_or_path=tokenizer_name_or_path,
-        train_tensor_parallel_size=train_tensor_parallel_size,
-        train_pipeline_parallel_size=train_pipeline_parallel_size,
-        infer_tensor_parallel_size=infer_tensor_parallel_size,
-        infer_pipeline_parallel_size=infer_pipeline_parallel_size,
-        max_num_seqs=max_num_seqs,
-        trust_remote_code=trust_remote_code
-    )
-
-    logger.info("model inited")
-    inference_engine.free_cache_engine()
-    torch.cuda.empty_cache()
-    logger.info("free_cache")
-
-    inference_engine.offload_model_weights()
-    logger.info("offload_model")
-    torch.cuda.empty_cache()
-    logger.info("empty_cache")
-
-    logger.info("enter sync_model_weights")
-    inference_engine.sync_model_weights(actor_weights)
-
-    logger.info("enter init_cache_engine")
-    inference_engine.init_cache_engine()
-
-    logger.info("=" * 80)
-    logger.info("start chat")
-    outputs = inference_engine.chat(conversation)
-    logger.info("chat result is ", outputs)
-
-    idx_list = []
-    idx_list_per_step = []
-    for i in range(2):
-        for j in range(4):
-            tokens = torch.randint(100, (10,))
-            idx_list_per_step.append(tokens.view(-1).cpu().numpy().tolist())
-        idx_list.extend(idx_list_per_step)
-        idx_list_per_step = []
-    logger.info(len(idx_list), [len(i) for i in idx_list])
-
-    logger.info("start test generate_sequences ")
-    outputs = inference_engine.generate_sequences(
-        idx_list=idx_list,
-    )
-    logger.info("generate_sequences output is:")
-    logger.info(outputs[0])
-
-    logger.info("input")
-    logger.info(idx_list[0])
-
-
-if __name__ == "__main__":
-    main()
@ -1,141 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-# Copyright (c) Huawei Technologies Co., Ltd.2023-2025. All rights reserved.
-import os
-import logging
-
-import tensordict
-import torch
-from torch_npu.contrib import transfer_to_npu
-
-from mindspeed_rl.models.rollout.vllm_engine import VLLMInferEngine
-from mindspeed_rl.config_cls.megatron_config import MegatronConfig
-from mindspeed_rl.utils.loggers import Loggers
-
-tokenizer_name_or_path = "/data/for_dt/tokenizer/Llama-3.2-1B-Instruct/"
-weights_path = "/data/for_dt/weights/Llama-3.2-1B-tp1pp1/iter_0000001/mp_rank_00/model_optim_rng.pt"
-megatron_dict = {"num_attention_heads": 32,
-                 "tensor_model_parallel_size": 1,
-                 "num_query_groups": 8,
-                 "group_query_attention": True}
-sampling_config = {
-    "num_completions": 1,  # number of independent completions per input prompt
-    "logprobs": 1,  # number of top-token log probabilities to return
-    "max_tokens": 128,  # maximum number of generated tokens
-    "best_of": 2,  # number of candidate completions generated internally, from which the best is selected
-    "top_p": 1.0,  # cumulative probability threshold for nucleus sampling
-    "top_k": 50,  # number of highest-probability tokens considered during sampling
-    "min_p": 0.0,  # minimum probability threshold for token selection
-    "temperature": 0.2,  # temperature controlling randomness of predictions
-    "detokenize": False  # whether to convert generated tokens back into readable strings
-}
-
-
-def main():
-    logger = Loggers(
-        name="test_vllm_engine_multistep_decode",
-    )
-    logger.info("start test_vllm_engine_multistep_decode")
-
-    conversation = [
-        {
-            "role": "system",
-            "content": "You are a helpful assistant"
-        },
-        {
-            "role": "user",
-            "content": "Hello"
-        },
-        {
-            "role": "assistant",
-            "content": "Hello! How can I assist you today?"
-        },
-        {
-            "role": "user",
-            "content": "Write an essay about the importance of higher education.",
-        },
-    ]
-
-    logger.info("load megatron weight")
-    megatron_st = torch.load(weights_path)
-    actor_weights = megatron_st['model']
-
-    # parameters required to initialize the engine
-
-    train_tensor_parallel_size = 1
-    train_pipeline_parallel_size = 1
-    infer_tensor_parallel_size = 1
-    infer_pipeline_parallel_size = 1
-    train_expert_parallel_size = 1
-    infer_expert_parallel_size = 1
-    max_num_seqs = 256
-    trust_remote_code = True
-
-    logger.info("enter vllmInferEngine")
-
-    megatron_config = MegatronConfig(megatron_dict, {})
-    megatron_config.num_attention_heads = 32
-    megatron_config.tensor_model_parallel_size = 1
-    megatron_config.num_query_groups = 8
-    megatron_config.num_key_value_heads = 8
-    megatron_config.group_query_attention = True
-    # initialize the VLLMInferEngine instance
-    inference_engine = VLLMInferEngine(
-        megatron_config=megatron_config,
-        sampling_config=sampling_config,
-        train_expert_parallel_size=train_expert_parallel_size,
-        infer_expert_parallel_size=infer_expert_parallel_size,
-        tokenizer_name_or_path=tokenizer_name_or_path,
-        train_tensor_parallel_size=train_tensor_parallel_size,
-        train_pipeline_parallel_size=train_pipeline_parallel_size,
-        infer_tensor_parallel_size=infer_tensor_parallel_size,
-        infer_pipeline_parallel_size=infer_pipeline_parallel_size,
-        max_num_seqs=max_num_seqs,
-        trust_remote_code=trust_remote_code,
-        num_scheduler_steps=8,  # 8 decode steps
-    )
-
-    logger.info("model inited")
-    inference_engine.free_cache_engine()
-    torch.cuda.empty_cache()
-    logger.info("free_cache")
-
-    inference_engine.offload_model_weights()
-    logger.info("offload_model")
-    torch.cuda.empty_cache()
-    logger.info("empty_cache")
-
-    logger.info("enter sync_model_weights")
-    inference_engine.sync_model_weights(actor_weights)
-
-    logger.info("enter init_cache_engine")
-    inference_engine.init_cache_engine()
-
-    logger.info("=" * 80)
-    logger.info("start chat")
-    outputs = inference_engine.chat(conversation)
-    logger.info("chat result is ", outputs)
-
-    idx_list = []
-    idx_list_per_step = []
-    for i in range(2):
-        for j in range(4):
-            tokens = torch.randint(100, (10,))
-            idx_list_per_step.append(tokens.view(-1).cpu().numpy().tolist())
-        idx_list.extend(idx_list_per_step)
-        idx_list_per_step = []
-    logger.info(len(idx_list), [len(i) for i in idx_list])
-
-    logger.info("start test generate_sequences ")
-    outputs = inference_engine.generate_sequences(
-        idx_list=idx_list,
-    )
-    logger.info("generate_sequences output is:")
-    logger.info(outputs[0])
-
-    logger.info("input")
-    logger.info(idx_list[0])
-
-
-if __name__ == "__main__":
-    main()
@ -1,45 +1,54 @@
-#!/bin/bash
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd)
-export PYTHONPATH=$SCRIPT_DIR/../../..:$PYTHONPATH
-GPUS_PER_NODE=8
-MASTER_ADDR=localhost
-MASTER_PORT=6555
-NNODES=1
-NODE_RANK=0
-WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
-
-
-DISTRIBUTED_ARGS="
-    --nproc_per_node $GPUS_PER_NODE \
-    --nnodes $NNODES \
-    --node_rank $NODE_RANK \
-    --master_addr $MASTER_ADDR \
-    --master_port $MASTER_PORT
-"
-PYTHON_ARGS="
-    --model-path "/data/for_dt/weights/Qwen2.5-7B-mg" \
-    --tokenizer-path "/data/for_dt/weights/Qwen2.5-7B" \
-    --train-tp 4 \
-    --train-pp 2 \
-    --train-ep 1 \
-    --infer-tp 2 \
-    --infer-pp 1 \
-    --infer-ep 1
-"
-PYTHON_ARGS_new="
-    --model-path "/data/for_dt/weights/Qwen2.5-7B-tp2pp2" \
-    --tokenizer-path "/data/for_dt/weights/Qwen2.5-7B" \
-    --train-tp 2 \
-    --train-pp 2 \
-    --train-ep 1 \
-    --infer-tp 4 \
-    --infer-pp 1 \
-    --infer-ep 1
-"
-
-echo "start test_resharding st"
-
-torchrun $DISTRIBUTED_ARGS $SCRIPT_DIR/test_resharding.py $PYTHON_ARGS
-
-torchrun $DISTRIBUTED_ARGS $SCRIPT_DIR/test_resharding.py $PYTHON_ARGS_new
+# #!/bin/bash
+
+# export CUDA_DEVICE_MAX_CONNECTIONS=1
+# SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd)
+# export PYTHONPATH=$SCRIPT_DIR/../../..:$PYTHONPATH
+# export VLLM_DP_SIZE=1
+# export HCCL_BUFFSIZE=256
+# export VLLM_USE_V1=1
+# export VLLM_VERSION=0.9.0
+# export VLLM_ENABLE_GRAPH_MODE=0
+# export VLLM_ENABLE_MC2=0
+# export HCCL_OP_EXPANSION_MODE="AIV"
+# export VLLM_ENABLE_TOPK_OPTIMZE=1
+
+# GPUS_PER_NODE=8
+# MASTER_ADDR=localhost
+# MASTER_PORT=6555
+# NNODES=1
+# NODE_RANK=0
+# WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
+
+# DISTRIBUTED_ARGS="
+#     --nproc_per_node $GPUS_PER_NODE \
+#     --nnodes $NNODES \
+#     --node_rank $NODE_RANK \
+#     --master_addr $MASTER_ADDR \
+#     --master_port $MASTER_PORT
+# "
+# PYTHON_ARGS="
+#     --model-path "/data/for_dt/weights/Qwen2.5-7B-mg" \
+#     --tokenizer-path "/data/for_dt/weights/Qwen2.5-7B" \
+#     --train-tp 4 \
+#     --train-pp 2 \
+#     --train-ep 1 \
+#     --infer-tp 2 \
+#     --infer-pp 1 \
+#     --infer-ep 1
+# "
+# PYTHON_ARGS_new="
+#     --model-path "/data/for_dt/weights/Qwen2.5-7B-tp2pp2" \
+#     --tokenizer-path "/data/for_dt/weights/Qwen2.5-7B" \
+#     --train-tp 2 \
+#     --train-pp 2 \
+#     --train-ep 1 \
+#     --infer-tp 4 \
+#     --infer-pp 1 \
+#     --infer-ep 1
+# "
+
+# echo "start test_resharding st"
+
+# torchrun $DISTRIBUTED_ARGS $SCRIPT_DIR/test_resharding.py $PYTHON_ARGS
+
+# torchrun $DISTRIBUTED_ARGS $SCRIPT_DIR/test_resharding.py $PYTHON_ARGS_new
@ -306,6 +306,7 @@ class TestActor():
            train_tensor_parallel_size=args.train_tp,
            train_pipeline_parallel_size=args.train_pp,
            train_expert_parallel_size=args.train_ep,
+           train_context_parallel_size=args.train_cp,
            infer_tensor_parallel_size=args.infer_tp,
            infer_pipeline_parallel_size=args.infer_pp,
            infer_expert_parallel_size=args.infer_ep,
@ -315,6 +316,7 @@ class TestActor():
            dtype="bfloat16",
            gpu_memory_utilization=0.6,
            trust_remote_code=True,
+           enforce_eager=True,
            megatron_config=megatron_config
        )
        self.megatron_offloader = MegatronOffLoader(self.model, self.optimizer)
@ -364,6 +366,7 @@ def parse_args():
    parser.add_argument("--train-tp", type=int, default=2)
    parser.add_argument("--train-pp", type=int, default=2)
    parser.add_argument("--train-ep", type=int, default=1)
+   parser.add_argument("--train_cp", type=int, default=1)
    parser.add_argument("--infer-tp", type=int, default=4)
    parser.add_argument("--infer-pp", type=int, default=1)
    parser.add_argument("--infer-ep", type=int, default=1)