!309 【resharding】support EP+vpp+nooplayer

Merge pull request !309 from zhoubeirong/master
This commit is contained in:
zhoubeirong
2025-06-08 08:57:45 +00:00
committed by i-robot
parent a3438706bc
commit 02c296d922
28 changed files with 843 additions and 824 deletions

5
.gitignore vendored
View File

@ -157,4 +157,7 @@ cython_debug/
/kernel*/ /kernel*/
/logs/ /logs/
/model_from_hf/ /model_from_hf/
/model_weights/ /model_weights/
/outputs/
/vllm/
/vllm_ascend/

View File

@ -54,7 +54,11 @@ MindSpeed RL is a reinforcement learning acceleration framework based on the Ascend ecosystem, designed to provide Huawei [
</tr> </tr>
<tr> <tr>
<td>vllm</td> <td>vllm</td>
<td>0.7.3</td> <td>main</td>
</tr>
<tr>
<td>vllm-ascend</td>
<td>main</td>
</tr> </tr>
</table> </table>

View File

@ -1,244 +0,0 @@
import os
import argparse
import logging
import tensordict
import torch
import torch_npu
from torch_npu.contrib import transfer_to_npu
import vllm.distributed.parallel_state as ps
from mindspeed_rl.models.rollout.vllm_engine import VLLMInferEngine
from mindspeed_rl.utils.loggers import Loggers
logger = Loggers(
name="vllm_engine_inference",
)
def get_args():
parser = argparse.ArgumentParser()
group = parser.add_argument_group(title='inference args')
group.add_argument('--tokenizer-name-or-path', type=str,
help="Huggingface config path.")
group.add_argument('--load-format', type=str,
choices=["auto", "megatron"], default="auto",
help="Vllm weight load format, support auto from huggingface and from megatron format.")
group.add_argument('--load', type=str,
default=None,
help="Vllm weight path for megatron load format.")
group.add_argument('--tensor-parallel-size', type=int,
default=1,
help="infer tensor parallel size")
group.add_argument('--query', type=str, default="Write an essay about the importance of higher education.",
help='Input query.')
group.add_argument('--task', type=str,
choices=["generation", "chat"], default="chat",
help='Inference task, generation or chat.')
group.add_argument('--gpu-memory-utilization', type=float, default=0.9,
help='Device memory ratio allocated for vllm.')
group = parser.add_argument_group(title='distributed')
group.add_argument('--distributed-backend', default='nccl',
choices=['nccl', 'gloo'],
help='Which backend to use for distributed training.')
group.add_argument('--local-rank', type=int, default=int(os.getenv('LOCAL_RANK', '0')),
help='Local rank passed from distributed launcher.')
group.add_argument('--prompt-type', type=str, default=None,
choices=['default', 'empty', 'trl', 'qwen', 'qwen_r1', "qwen_math_r1", 'llama3', 'mistral', 'mixtral', 'gemma', 'llama2',
'alpaca', 'deepseek2', 'deepseek2-lite', 'minicpm3', 'cpm', 'baichuan2', 'deepseek3'],
help='Which template to use for constructing prompts in training/inference, e.g., "qwen".')
group.add_argument('--prompt-type-path', type=str, default=None,
help='Path to the json file of templates.')
group = parser.add_argument_group(title='sampling params')
group.add_argument('--num-completions', type=int, default=1,
help='Number of output sequences to return for the given prompt.')
group.add_argument('--logprobs', type=int, default=1,
help='Number of log probabilities to return per output token.')
group.add_argument('--max-tokens', type=int, default=128,
help='Maximum number of tokens to generate per output sequence.')
group.add_argument('--top-p', type=float, default=1.0,
help='Float that controls the cumulative probability of the top tokens to consider.')
group.add_argument('--top-k', type=int, default=-1,
help='Integer that controls the number of top tokens to consider. Set to -1 to consider all tokens.')
group.add_argument('--temperature', type=float, default=1.0,
help='Float that controls the randomness of the sampling.')
return parser.parse_args()
def process_outputs(outputs):
res = ""
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
res = res + f"Prompt: {prompt!r}\nGenerated Text: {generated_text!r}\n"
res = res + "-" * 80
return res
def main():
logger.info("start vllm_engine inference")
args = get_args()
sampling_config = {
"num_completions": args.num_completions, # 每个输入提示生成的独立完成项数量
"logprobs": args.logprobs, # 返回的 top token 的对数概率数量
"max_tokens": args.max_tokens, # 生成输出的最大 token 数量
"top_p": args.top_p, # 核采样的累积概率阈值
"top_k": args.top_k, # 采样时考虑的最高概率 token 的数量
"temperature": args.temperature, # 控制预测随机性的温度参数
"detokenize": True # 是否将生成的 token 转换回可读字符串
}
inference_engine = VLLMInferEngine(
megatron_config=None,
sampling_config=sampling_config,
train_expert_parallel_size=1,
infer_expert_parallel_size=1,
tokenizer_name_or_path=args.tokenizer_name_or_path,
prompt_type=args.prompt_type,
prompt_type_path=args.prompt_type_path,
train_tensor_parallel_size=args.tensor_parallel_size,
train_pipeline_parallel_size=1,
infer_tensor_parallel_size=args.tensor_parallel_size,
infer_pipeline_parallel_size=1,
max_num_seqs=1,
gpu_memory_utilization=args.gpu_memory_utilization,
trust_remote_code=True,
load_format=args.load_format
)
if args.load_format == "megatron":
tp_rank = ps._TP.rank_in_group
weights_path = os.path.join(args.load, f"iter_0000001/mp_rank_{tp_rank:02}/model_optim_rng.pt")
actor_weights = torch.load(weights_path)['model']
actor_weights = replace_state_dict_name(
actor_weights,
vllm_dict=inference_engine.model.state_dict(),
arch=inference_engine.model.__class__.__name__)
logger.info("sync_model_weights")
inference_engine.sync_model_weights(actor_weights)
logger.info("init_cache_engine")
inference_engine.init_cache_engine()
if args.task == "chat":
chat_task(inference_engine, args.query)
elif args.task == "generation":
generate_task(inference_engine, args.query)
def chat_task(inference_engine, query):
conversation = [
{
"role": "user",
"content": query,
},
]
outputs = inference_engine.chat(conversation)
res = process_outputs(outputs)
logger.info('Query: {}'.format(query))
logger.info('Responses:\n{}'.format(res))
def generate_task(inference_engine, query):
outputs = inference_engine.llm.generate(
prompts=[query],
sampling_params=inference_engine.sampling_params,
)
res = process_outputs(outputs)
logger.info('Query: {}'.format(query))
logger.info('Responses:\n{}'.format(res))
def replace_state_dict_name(state_dict, vllm_dict, arch=None):
params_mapping = [
# (megatron core gpt model name, vllm model name)
("embedding.word_embeddings", "model.embed_tokens"),
("self_attention.linear_qkv", "self_attn.qkv_proj"),
("self_attention.linear_proj", "self_attn.o_proj"),
("input_layernorm", "input_layernorm"),
("pre_mlp_layernorm", "post_attention_layernorm"),
("mlp.linear_fc1", "mlp.gate_up_proj"),
("mlp.linear_fc2", "mlp.down_proj"),
("decoder.final_layernorm", "model.norm"),
("output_layer", "lm_head"),
# Deepseek add
("self_attention.linear_qb", "self_attn.q_b_proj"),
("self_attention.linear_kvb", "self_attn.kv_b_proj"),
("mlp.router.weight", "mlp.gate.weight"),
("mlp.router.expert_bias", "mlp.gate.e_score_correction_bias"),
("mlp.shared_experts.linear_fc1", "mlp.shared_experts.gate_up_proj"),
("mlp.shared_experts.linear_fc2", "mlp.shared_experts.down_proj"),
("mlp.experts.weight1", "mlp.experts.w13_weight"),
("mlp.experts.weight2", "mlp.experts.w2_weight"),
("self_attention.q_layernorm", "self_attn.q_a_layernorm"),
("self_attention.k_layernorm", "self_attn.kv_a_layernorm"),
]
new_state_dict = {}
for name, loaded_weight in state_dict.items():
if "_extra_state" in name:
continue
if "Deepseek" in arch:
name = _replace_name_m2v_deepseek(name, params_mapping)
else:
name = _replace_name_m2v(name, params_mapping)
# the router bias in the raw weights is stored in fp32
if "e_score_correction_bias" in name:
loaded_weight = loaded_weight.to(vllm_dict[name].dtype)
# adapt to 'copy_' in the megatron weight loader to save memory
if "mlp.experts" in name:
loaded_weight = loaded_weight.view(vllm_dict[name].shape)
new_state_dict[name] = loaded_weight
return new_state_dict
def _replace_name_m2v(name, name_mapping):
"""
Transfer state dict names from megatron to vllm.
"""
for m_name, v_name in name_mapping:
if m_name not in name:
continue
if "layers" in name: # deal with decoder layers
name = name.replace("decoder", "model")
name_list = name.split(".")
if "layer_norm_weight" in name_list or "layer_norm_bias" in name_list:
param_name_list = name_list[:3]
param_name_list.append(v_name)
param_name = ".".join(param_name_list)
else:
param_name_list = name_list[:3]
weight_or_bias = name_list[-1]
param_name_list.append(v_name)
param_name_list.append(weight_or_bias)
param_name = ".".join(param_name_list)
return param_name
else:
param_name = name.replace(m_name, v_name)
return param_name
return name
def _replace_name_m2v_deepseek(name, name_mapping):
"""
Transfer state dict names from megatron to vllm.
"""
for m_name, v_name in name_mapping:
if m_name not in name:
continue
if "layers" in name: # deal with decoder layers
name = name.replace("decoder", "model")
param_name = name.replace(m_name, v_name)
return param_name
return name
if __name__ == "__main__":
main()
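For orientation, a minimal standalone sketch of the Megatron-to-vLLM renaming performed by the removed script above; the parameter name is hypothetical, and the real replace_state_dict_name additionally handles layer norms and MoE expert weights.

```python
# Minimal sketch (hypothetical parameter name) of the Megatron -> vLLM renaming.
PARAMS_MAPPING = [
    ("self_attention.linear_qkv", "self_attn.qkv_proj"),
    ("mlp.linear_fc1", "mlp.gate_up_proj"),
]

def rename(megatron_name: str) -> str:
    for m_name, v_name in PARAMS_MAPPING:
        if m_name in megatron_name:
            # decoder.* becomes model.*; splice the mapped module name back
            # together with the layer prefix and the weight/bias suffix
            name = megatron_name.replace("decoder", "model")
            parts = name.split(".")
            return ".".join(parts[:3] + [v_name, parts[-1]])
    return megatron_name

print(rename("decoder.layers.0.self_attention.linear_qkv.weight"))
# -> model.layers.0.self_attn.qkv_proj.weight
```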

View File

@ -8,6 +8,15 @@ env_vars:
HCCL_IF_BASE_PORT: '48000' HCCL_IF_BASE_PORT: '48000'
CUDA_DEVICE_MAX_CONNECTIONS: '1' CUDA_DEVICE_MAX_CONNECTIONS: '1'
HYDRA_FULL_ERROR: '1' HYDRA_FULL_ERROR: '1'
VLLM_DP_SIZE: '1'
HCCL_BUFFSIZE: '256'
VLLM_USE_V1: '1'
VLLM_VERSION: '0.9.0'
VLLM_ENABLE_GRAPH_MODE: '0'
VLLM_ENABLE_MC2: '0'
HCCL_OP_EXPANSION_MODE: "AIV"
VLLM_ENABLE_TOPK_OPTIMZE: "1"
# GLOO_SOCKET_IFNAME: "Your SOCKET IFNAME" # GLOO_SOCKET_IFNAME: "Your SOCKET IFNAME"
# TP_SOCKET_IFNAME: "Your SOCKET IFNAME" # TP_SOCKET_IFNAME: "Your SOCKET IFNAME"
# HCCL_SOCKET_IFNAME: "Your SOCKET IFNAME" # HCCL_SOCKET_IFNAME: "Your SOCKET IFNAME"

View File

@ -79,6 +79,7 @@ rl_config:
guarantee_order: true guarantee_order: true
generate_config: generate_config:
enforce_eager: True
trust_remote_code: true trust_remote_code: true
offload_train_optimizer: true offload_train_optimizer: true
offload_train_grad: true offload_train_grad: true

View File

@ -80,6 +80,7 @@ rl_config:
num_npus: 16 num_npus: 16
generate_config: generate_config:
enforce_eager: True
trust_remote_code: true trust_remote_code: true
offload_train_optimizer: true offload_train_optimizer: true
offload_train_grad: true offload_train_grad: true

View File

@ -98,16 +98,16 @@ pip install apex-0.1.dev*.whl
```shell ```shell
git clone https://github.com/vllm-project/vllm.git git clone https://github.com/vllm-project/vllm.git
cd vllm cd vllm
git checkout v0.7.3 git checkout 5bc1ad6cee754405464a9957e86cf3a9302e4986
pip install -r requirements-build.txt pip install -r requirements-build.txt
VLLM_TARGET_DEVICE=empty pip install . VLLM_TARGET_DEVICE=empty pip install .
``` ```
### vllm_ascend安装 ### vllm_ascend安装
```shell ```shell
git clone -b v0.7.3-dev https://github.com/vllm-project/vllm-ascend.git git clone -b main https://github.com/vllm-project/vllm-ascend.git
cd vllm-ascend cd vllm-ascend
git checkout 0713836e95fe993feefe334945b5b273e4add1f1 git checkout 908a851
pip install -e . pip install -e .
``` ```

View File

@ -1,32 +0,0 @@
#!/bin/bash
export GLOO_SOCKET_IFNAME="Your SOCKET IFNAME"
export TP_SOCKET_IFNAME="Your SOCKET IFNAME"
export CUDA_DEVICE_MAX_CONNECTIONS=1
GPUS_PER_NODE=8
MASTER_ADDR="host ip"
MASTER_PORT=6001
NNODES=4
NODE_RANK="node rank"
DISTRIBUTED_ARGS="
--nproc_per_node $GPUS_PER_NODE \
--nnodes $NNODES \
--node_rank $NODE_RANK \
--master_addr $MASTER_ADDR \
--master_port $MASTER_PORT
"
INFER_ARGS="
--tokenizer-name-or-path 'your huggingface config path' \
--load-format megatron \
--load 'megatron weight path' \
--tensor-parallel-size 32 \
--task chat \
"
torchrun $DISTRIBUTED_ARGS cli/infer_vllm.py \
$INFER_ARGS \
--query "Write an essay about the importance of higher education." \
--distributed-backend nccl

View File

@ -25,6 +25,9 @@ class GenerateConfig(BaseConfig):
dtype: Data type for model weights. Default is "bfloat16". dtype: Data type for model weights. Default is "bfloat16".
gpu_memory_utilization: GPU memory utilization factor. Default is 0.5. gpu_memory_utilization: GPU memory utilization factor. Default is 0.5.
enforce_eager: Whether to always use eager-mode PyTorch. If True, we will disable ACL graph and always execute the model in eager mode.
If False, we will use ACL graph and eager execution in hybrid for maximal performance and flexibility.
sampling_config: Configuration for text generation sampling. Default values are set for various sampling parameters. sampling_config: Configuration for text generation sampling. Default values are set for various sampling parameters.
- num_completions: The number of independent completions to generate for each input prompt. Default is 1. - num_completions: The number of independent completions to generate for each input prompt. Default is 1.
- logprobs: The number of top tokens to return log probabilities for. Default is 1. - logprobs: The number of top tokens to return log probabilities for. Default is 1.
@ -72,6 +75,7 @@ class GenerateConfig(BaseConfig):
self.enable_prefix_caching = False self.enable_prefix_caching = False
self.num_scheduler_steps = 1 self.num_scheduler_steps = 1
self.enforce_eager = False
# Default values for the sampling configuration, used to set the sampling strategy for text generation # Default values for the sampling configuration, used to set the sampling strategy for text generation
self.sampling_config = { self.sampling_config = {

View File

@ -119,6 +119,7 @@ class MegatronConfig(BaseConfig):
tensor_model_parallel_size: Size of tensor model parallelism (default: 1) tensor_model_parallel_size: Size of tensor model parallelism (default: 1)
pipeline_model_parallel_size: Size of pipeline model parallelism (default: 1) pipeline_model_parallel_size: Size of pipeline model parallelism (default: 1)
expert_model_parallel_size: Degree of expert model parallelism (default: 1) expert_model_parallel_size: Degree of expert model parallelism (default: 1)
num_layers_per_virtual_pipeline_stage: Number of layers per virtual pipeline stage (vpp) (default: None)
lr: Learning rate (default: None) lr: Learning rate (default: None)
lr_decay_style: Learning rate decay style (default: 'linear') lr_decay_style: Learning rate decay style (default: 'linear')
min_lr: Minimum learning rate (default: 0.0) min_lr: Minimum learning rate (default: 0.0)
@ -198,6 +199,7 @@ class MegatronConfig(BaseConfig):
reuse_fp32_param: The distributed training optimizer frees up 'param copies of FP32 to save memory. (default: False) reuse_fp32_param: The distributed training optimizer frees up 'param copies of FP32 to save memory. (default: False)
moe_tp_extend_ep: use tp group to extend experts parallelism instead of sharding weight tensor of experts in tp group moe_tp_extend_ep: use tp group to extend experts parallelism instead of sharding weight tensor of experts in tp group
moe_alltoall_overlap_comm: moe_alltoall_overlap_comm moe_alltoall_overlap_comm: moe_alltoall_overlap_comm
noop_layers: string specifying no-op layers (default: None)
''' '''
def __init__(self, training_config: Dict, model_config: Dict): def __init__(self, training_config: Dict, model_config: Dict):
@ -305,6 +307,7 @@ class MegatronConfig(BaseConfig):
self.tensor_model_parallel_size = 1 self.tensor_model_parallel_size = 1
self.pipeline_model_parallel_size = 1 self.pipeline_model_parallel_size = 1
self.expert_model_parallel_size = 1 self.expert_model_parallel_size = 1
self.num_layers_per_virtual_pipeline_stage = None
self.lr = None self.lr = None
self.lr_decay_style = 'linear' self.lr_decay_style = 'linear'
self.min_lr = 0.0 self.min_lr = 0.0
@ -358,5 +361,6 @@ class MegatronConfig(BaseConfig):
self.overlap_param_gather = False self.overlap_param_gather = False
self.recompute_activation_function = False self.recompute_activation_function = False
self.swap_attention = False self.swap_attention = False
self.noop_layers = None
self.update(training_config, model_config) self.update(training_config, model_config)
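A hedged sketch of how such a noop_layers value might be consumed, assuming (not confirmed by this diff) that it is a comma-separated string of layer indices:

```python
# Assumption: noop_layers is a comma-separated string of layer indices
# (e.g. "30,31") marking layers that are replaced by no-op placeholders so the
# layer count divides evenly across pipeline stages. The parsing below is an
# illustrative sketch, not the confirmed MindSpeed implementation.
def parse_noop_layers(noop_layers):
    if noop_layers is None:
        return set()
    return {int(idx) for idx in str(noop_layers).split(",") if idx.strip()}

print(parse_noop_layers("30,31"))  # {30, 31}
print(parse_noop_layers(None))     # set()
```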

View File

@ -29,6 +29,7 @@ def validate_rl_args(
raise ValueError( raise ValueError(
f"integrated_mode_config should not be set when use_integrated_worker mode is off.") f"integrated_mode_config should not be set when use_integrated_worker mode is off.")
# Validate the sequence length against the model's maximum length # Validate the sequence length against the model's maximum length
if generate_config.max_model_len < actor_config.seq_length: if generate_config.max_model_len < actor_config.seq_length:
raise ValueError( raise ValueError(

View File

@ -17,6 +17,7 @@ class BaseInferEngine(ABC):
prompt_type: str = None, prompt_type: str = None,
prompt_type_path: str = None, prompt_type_path: str = None,
train_expert_parallel_size: int = 1, train_expert_parallel_size: int = 1,
train_context_parallel_size: int = 1,
infer_tensor_parallel_size: int = 8, infer_tensor_parallel_size: int = 8,
infer_pipeline_parallel_size: int = 1, infer_pipeline_parallel_size: int = 1,
infer_expert_parallel_size: int = 1, infer_expert_parallel_size: int = 1,
@ -34,6 +35,7 @@ class BaseInferEngine(ABC):
train_tensor_parallel_size (int): Tensor parallel size during training. train_tensor_parallel_size (int): Tensor parallel size during training.
train_pipeline_parallel_size (int): Pipeline parallel size during training. train_pipeline_parallel_size (int): Pipeline parallel size during training.
train_expert_parallel_size (int): Expert parallel size during training. train_expert_parallel_size (int): Expert parallel size during training.
train_context_parallel_size (int): Context parallel size during training.
infer_tensor_parallel_size (int): Tensor parallel size during inference. infer_tensor_parallel_size (int): Tensor parallel size during inference.
infer_pipeline_parallel_size (int): Pipeline parallel size during inference. infer_pipeline_parallel_size (int): Pipeline parallel size during inference.
infer_expert_parallel_size (int): Expert parallel size during inference. infer_expert_parallel_size (int): Expert parallel size during inference.
@ -49,6 +51,7 @@ class BaseInferEngine(ABC):
self.train_tensor_parallel_size = train_tensor_parallel_size self.train_tensor_parallel_size = train_tensor_parallel_size
self.train_pipeline_parallel_size = train_pipeline_parallel_size self.train_pipeline_parallel_size = train_pipeline_parallel_size
self.train_expert_parallel_size = train_expert_parallel_size self.train_expert_parallel_size = train_expert_parallel_size
self.train_context_parallel_size = train_context_parallel_size
self.infer_tensor_parallel_size = infer_tensor_parallel_size self.infer_tensor_parallel_size = infer_tensor_parallel_size
self.infer_pipeline_parallel_size = infer_pipeline_parallel_size self.infer_pipeline_parallel_size = infer_pipeline_parallel_size
self.infer_expert_parallel_size = infer_expert_parallel_size self.infer_expert_parallel_size = infer_expert_parallel_size

View File

@ -92,6 +92,9 @@ class BaseTrainingEngine(ABC):
shuffle_mini_batch=self.shuffle_mini_batch) shuffle_mini_batch=self.shuffle_mini_batch)
n_micro_batch = len(batches) n_micro_batch = len(batches)
seq_len = batches[0]['input_ids'].shape[1] seq_len = batches[0]['input_ids'].shape[1]
data_iter = iter(batches)
if len(self.model) > 1:
data_iter = [iter(batches) for _ in self.model]
self.loss_func.add_loss_meta_info(self.get_loss_meta_func()) self.loss_func.add_loss_meta_info(self.get_loss_meta_func())
@ -104,7 +107,7 @@ class BaseTrainingEngine(ABC):
# batch should be a list of batches inside micro-batches # batch should be a list of batches inside micro-batches
losses_reduced = self.forward_backward_func( losses_reduced = self.forward_backward_func(
forward_step_func=forward_step, forward_step_func=forward_step,
data_iterator=iter(batches), data_iterator=data_iter,
model=self.model, model=self.model,
num_microbatches=n_micro_batch, num_microbatches=n_micro_batch,
seq_length=seq_len, seq_length=seq_len,
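The data_iter change above is needed because, with virtual pipeline parallelism, self.model is a list of model chunks and the interleaved forward-backward schedule expects one data iterator per chunk. A minimal sketch with stand-in names (not the real engine objects):

```python
# Stand-in example: under vpp the model is a list of chunks, so the schedule
# gets a list of iterators, one per chunk, instead of a single iterator.
batches = [{"input_ids": [i]} for i in range(4)]   # pretend micro-batches
model = ["chunk_0", "chunk_1"]                     # stand-in for vpp model chunks

data_iter = iter(batches)
if len(model) > 1:                                 # vpp enabled
    data_iter = [iter(batches) for _ in model]

print(isinstance(data_iter, list))                 # True when vpp is on
```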

View File

@ -7,13 +7,6 @@ import torch
import torch.nn as nn import torch.nn as nn
from transformers.configuration_utils import PretrainedConfig from transformers.configuration_utils import PretrainedConfig
from vllm.model_executor.layers.linear import (
ColumnParallelLinear, MergedColumnParallelLinear, QKVParallelLinear,
RowParallelLinear, ReplicatedLinear)
from vllm.model_executor.layers.fused_moe.layer import FusedMoE
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead, VocabParallelEmbedding
from vllm.model_executor.models import ModelRegistry
class InferParallelConfig: class InferParallelConfig:
def __init__(self, infer_tensor_parallel_size: int, infer_pipeline_parallel_size: int, infer_expert_parallel_size: int): def __init__(self, infer_tensor_parallel_size: int, infer_pipeline_parallel_size: int, infer_expert_parallel_size: int):
@ -88,14 +81,15 @@ def deepseek_megatron_weight_loader(actor_weights: Dict, vllm_model: nn.Module,
if name not in params_dict.keys(): if name not in params_dict.keys():
raise ValueError(f"unexpected key {name} in deepseek_megatron_weight_loader") raise ValueError(f"unexpected key {name} in deepseek_megatron_weight_loader")
if "mlp.experts.w13_weight" in name: if "mlp.experts.w13_weight" in name:
loaded_weight.copy_(loaded_weight.view(hf_config.n_routed_experts, hf_config.hidden_size, -1).transpose(2, 1).contiguous()) loaded_weight.copy_(loaded_weight.view(hf_config.n_routed_experts // infer_paralle_config.infer_expert_parallel_size, hf_config.hidden_size, -1).transpose(2, 1).contiguous())
if "mlp.experts.w2_weight" in name: if "mlp.experts.w2_weight" in name:
loaded_weight.copy_(loaded_weight.view(hf_config.n_routed_experts, -1, hf_config.hidden_size).transpose(2, 1).contiguous()) loaded_weight.copy_(loaded_weight.view(hf_config.n_routed_experts // infer_paralle_config.infer_expert_parallel_size, -1, hf_config.hidden_size).transpose(2, 1).contiguous())
load_single_weight(params_dict, name, loaded_weight) load_single_weight(params_dict, name, loaded_weight)
return vllm_model return vllm_model
def _get_model_weight_loader(arch: str): def _get_model_weight_loader(arch: str):
from vllm.model_executor.models import ModelRegistry
if arch in MODEL_MEGATRON_WEIGHT_LOADER_REGISTRY: if arch in MODEL_MEGATRON_WEIGHT_LOADER_REGISTRY:
return MODEL_MEGATRON_WEIGHT_LOADER_REGISTRY[arch] return MODEL_MEGATRON_WEIGHT_LOADER_REGISTRY[arch]
raise ValueError(f"Model architectures {arch} are not supported for now. " raise ValueError(f"Model architectures {arch} are not supported for now. "
@ -146,6 +140,23 @@ def load_single_weight(params_dict, name, loaded_weight):
def update_megatron_weight_loader(): def update_megatron_weight_loader():
from vllm.model_executor.layers.linear import (
ColumnParallelLinear, MergedColumnParallelLinear, QKVParallelLinear,
RowParallelLinear, ReplicatedLinear)
from vllm.model_executor.layers.fused_moe.layer import FusedMoE
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead, VocabParallelEmbedding
LAYER_WEIGHT_MEGATRON_LOADER_REGISTRY = {
ColumnParallelLinear: parallel_weight_loader,
MergedColumnParallelLinear: parallel_weight_loader,
QKVParallelLinear: parallel_weight_loader,
RowParallelLinear: parallel_weight_loader,
VocabParallelEmbedding: parallel_weight_loader,
ParallelLMHead: parallel_weight_loader,
ReplicatedLinear: parallel_weight_loader,
FusedMoE: parallel_weight_loader
}
for layer_class, weight_loader in LAYER_WEIGHT_MEGATRON_LOADER_REGISTRY.items(): for layer_class, weight_loader in LAYER_WEIGHT_MEGATRON_LOADER_REGISTRY.items():
layer_class.weight_loader = weight_loader layer_class.weight_loader = weight_loader
@ -176,16 +187,6 @@ MODEL_MEGATRON_WEIGHT_LOADER_REGISTRY = {
"LlamaForCausalLM": llama_megatron_core_weight_loader, "LlamaForCausalLM": llama_megatron_core_weight_loader,
"Qwen2ForCausalLM": qwen_megatron_weight_loader, "Qwen2ForCausalLM": qwen_megatron_weight_loader,
"DeepseekV3ForCausalLM": deepseek_megatron_weight_loader, "DeepseekV3ForCausalLM": deepseek_megatron_weight_loader,
} "DeepseekV2ForCausalLM": deepseek_megatron_weight_loader,
"CustomDeepseekV3ForCausalLM": deepseek_megatron_weight_loader,
LAYER_WEIGHT_MEGATRON_LOADER_REGISTRY = {
ColumnParallelLinear: parallel_weight_loader,
MergedColumnParallelLinear: parallel_weight_loader,
QKVParallelLinear: parallel_weight_loader,
RowParallelLinear: parallel_weight_loader,
VocabParallelEmbedding: parallel_weight_loader,
ParallelLMHead: parallel_weight_loader,
ReplicatedLinear: parallel_weight_loader,
FusedMoE: parallel_weight_loader
} }

View File

@ -4,11 +4,18 @@
"""Model and data parallel groups.""" """Model and data parallel groups."""
import os import os
import re
import socket
import subprocess
from datetime import timedelta
from typing import Optional from typing import Optional
import torch import torch
import torch.distributed import torch.distributed as dist
import vllm.distributed.parallel_state as ps import vllm.distributed.parallel_state as ps
import vllm_ascend.distributed.parallel_state as ascend_ps
import vllm.envs as envs
from vllm.config import get_current_vllm_config
from vllm.distributed.parallel_state import ( from vllm.distributed.parallel_state import (
get_pp_group, get_pp_group,
@ -17,6 +24,10 @@ from vllm.distributed.parallel_state import (
init_model_parallel_group, init_model_parallel_group,
) )
from mindspeed_rl.utils.loggers import Loggers
logger = Loggers(__name__)
""" """
This version is strongly tied with Megatron to implement HybridEngine and weight sharing between vllm and Megatron. This version is strongly tied with Megatron to implement HybridEngine and weight sharing between vllm and Megatron.
@ -31,6 +42,12 @@ _DEVICE_MESH = None
_TP = None _TP = None
# Pipeline model parallel group that the current rank belongs to. # Pipeline model parallel group that the current rank belongs to.
_PP = None _PP = None
# Expert model parallel group that the current rank belongs to.
_EP = None
# Expert tensor model parallel group that the current rank belongs to.
_ETP = None
# Data model parallel group that the current rank belongs to.
_DP = None
# Tensor model parallel group # Tensor model parallel group
_TP_GROUP_RANKS = None _TP_GROUP_RANKS = None
@ -42,16 +59,20 @@ def get_vllm_tp_group_ranks():
# This method is for initializing the ParallelGroup when using HybridEngine # This method is for initializing the ParallelGroup when using HybridEngine
def initialize_parallel_state( def initialize_parallel_state(
distributed_init_method: str = "env://", distributed_init_method: str = "env://",
backend: str = "hccl", backend: str = "hccl",
infer_tensor_model_parallel_size: int = 1, infer_tensor_model_parallel_size: int = 1,
train_tensor_model_parallel_size: int = 1, train_tensor_model_parallel_size: int = 1,
infer_pipeline_model_parallel_size: int = 1, infer_pipeline_model_parallel_size: int = 1,
train_pipeline_model_parallel_size: int = 1 train_pipeline_model_parallel_size: int = 1,
infer_expert_tensor_parallel_size: int = 1,
train_expert_tensor_parallel_size: int = 1,
train_expert_model_parallel_size: int = 1,
infer_expert_model_parallel_size: int = 1,
train_context_model_parallel_size: int = 1,
): ):
os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1" os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"
# NOTE(sgm): Modify for verl, Env vars will be set by TORCHRUN. # NOTE(sgm): Modify for verl, Env vars will be set by TORCHRUN.
rank = int(os.getenv("RANK", "-1")) rank = int(os.getenv("RANK", "-1"))
local_rank = int(os.getenv("LOCAL_RANK", "0")) local_rank = int(os.getenv("LOCAL_RANK", "0"))
@ -60,6 +81,8 @@ def initialize_parallel_state(
world_size = int(os.getenv("WORLD_SIZE", "-1")) world_size = int(os.getenv("WORLD_SIZE", "-1"))
if world_size == -1: if world_size == -1:
raise ValueError("The world_size is set to -1, not initialized by TORCHRUN") raise ValueError("The world_size is set to -1, not initialized by TORCHRUN")
config = get_current_vllm_config()
config.parallel_config.tensor_parallel_size = infer_tensor_model_parallel_size
init_distributed_environment(world_size, rank, distributed_init_method, local_rank, backend) init_distributed_environment(world_size, rank, distributed_init_method, local_rank, backend)
if torch.distributed.get_world_size() > 1: if torch.distributed.get_world_size() > 1:
# NOTE: build a separate inference group with infer tp & micro dp # NOTE: build a separate inference group with infer tp & micro dp
@ -67,54 +90,29 @@ def initialize_parallel_state(
infer_tensor_model_parallel_size=infer_tensor_model_parallel_size, infer_tensor_model_parallel_size=infer_tensor_model_parallel_size,
train_tensor_model_parallel_size=train_tensor_model_parallel_size, train_tensor_model_parallel_size=train_tensor_model_parallel_size,
infer_pipeline_model_parallel_size=infer_pipeline_model_parallel_size, infer_pipeline_model_parallel_size=infer_pipeline_model_parallel_size,
train_pipeline_model_parallel_size=train_pipeline_model_parallel_size train_pipeline_model_parallel_size=train_pipeline_model_parallel_size,
infer_expert_tensor_parallel_size=infer_expert_tensor_parallel_size,
train_expert_tensor_parallel_size=train_expert_tensor_parallel_size,
train_expert_model_parallel_size=train_expert_model_parallel_size,
infer_expert_model_parallel_size=infer_expert_model_parallel_size,
train_context_model_parallel_size=train_context_model_parallel_size
) )
else: else:
initialize_model_parallel(infer_tensor_model_parallel_size, infer_pipeline_model_parallel_size, backend) initialize_model_parallel(infer_tensor_model_parallel_size, infer_pipeline_model_parallel_size, backend)
def ensure_model_parallel_initialized(
tensor_model_parallel_size: int,
pipeline_model_parallel_size: int = 1,
backend: Optional[str] = None,
) -> None:
"""Helper to initialize model parallel groups if they are not initialized,
or ensure tensor-parallel and pipeline-parallel sizes are equal to expected
values if the model parallel groups are initialized.
"""
# get the backend of _DEVICE_WORLD_GROUP
backend = backend or torch.distributed.get_backend(get_world_group().device_group)
if not model_parallel_is_initialized():
initialize_model_parallel(tensor_model_parallel_size, pipeline_model_parallel_size, backend)
return
current_tp_size = get_tensor_model_parallel_world_size()
if current_tp_size != tensor_model_parallel_size:
raise ValueError(
"tensor parallel group already initialized, but of unexpected size: "
f"{current_tp_size=} vs. "
f"{tensor_model_parallel_size=}"
)
pp_world_size = get_pp_group().world_size
if pp_world_size != pipeline_model_parallel_size:
raise ValueError(
"pipeline parallel group already initialized, but of unexpected size: "
f"{pp_world_size=} vs. "
f"{pipeline_model_parallel_size=}"
)
def model_parallel_is_initialized():
"""Check if tensor and pipeline parallel groups are initialized."""
return ps._TP is not None
# and _PIPELINE_MODEL_PARALLEL_GROUP is not None)
def initialize_model_parallel_for_vllm( def initialize_model_parallel_for_vllm(
infer_tensor_model_parallel_size: int, infer_tensor_model_parallel_size: int,
train_tensor_model_parallel_size: int = 1, train_tensor_model_parallel_size: int = 1,
infer_pipeline_model_parallel_size: int = 1, infer_pipeline_model_parallel_size: int = 1,
train_pipeline_model_parallel_size: int = 1 train_pipeline_model_parallel_size: int = 1,
infer_expert_tensor_parallel_size: int = 1,
train_expert_tensor_parallel_size: int = 1,
train_expert_model_parallel_size: int = 1,
infer_expert_model_parallel_size: int = 1,
train_context_model_parallel_size: int = 1,
num_process: int = 1,
rebulid_EP_group: bool = False
) -> None: ) -> None:
# Get world size and rank. Ensure some consistencies. # Get world size and rank. Ensure some consistencies.
@ -149,8 +147,10 @@ def initialize_model_parallel_for_vllm(
Returns: list of group_lists Returns: list of group_lists
[[g0, g1], [g2, g3], [g4, g5], [g6, g7]] [[g0, g1], [g2, g3], [g4, g5], [g6, g7]]
''' '''
if ((world_size // (train_tensor_model_parallel_size * train_pipeline_model_parallel_size)) * train_tensor_model_parallel_size < infer_tensor_model_parallel_size or if ((world_size // (
((world_size // (train_tensor_model_parallel_size * train_pipeline_model_parallel_size)) * train_tensor_model_parallel_size) % infer_tensor_model_parallel_size != 0): train_tensor_model_parallel_size * train_pipeline_model_parallel_size)) * train_tensor_model_parallel_size < infer_tensor_model_parallel_size or
((world_size // (
train_tensor_model_parallel_size * train_pipeline_model_parallel_size)) * train_tensor_model_parallel_size) % infer_tensor_model_parallel_size != 0):
raise ValueError( raise ValueError(
f"Can't split train tp size {train_tensor_model_parallel_size} to infer tp size {infer_tensor_model_parallel_size} " f"Can't split train tp size {train_tensor_model_parallel_size} to infer tp size {infer_tensor_model_parallel_size} "
f"with train dp size {(world_size // (train_tensor_model_parallel_size * train_pipeline_model_parallel_size))}.") f"with train dp size {(world_size // (train_tensor_model_parallel_size * train_pipeline_model_parallel_size))}.")
@ -179,16 +179,13 @@ def initialize_model_parallel_for_vllm(
[[g0, g2], [g1, g3], [g4, g6], [g5, g7]] [[g0, g2], [g1, g3], [g4, g6], [g5, g7]]
''' '''
if train_tensor_model_parallel_size < infer_tensor_model_parallel_size or train_tensor_model_parallel_size % infer_tensor_model_parallel_size != 0: if train_tensor_model_parallel_size < infer_tensor_model_parallel_size or train_tensor_model_parallel_size % infer_tensor_model_parallel_size != 0:
raise ValueError(f"Can't gather train tp size {train_tensor_model_parallel_size} to infer tp size {infer_tensor_model_parallel_size}") raise ValueError(
f"Can't gather train tp size {train_tensor_model_parallel_size} to infer tp size {infer_tensor_model_parallel_size}")
num_tensor_model_parallel_groups = world_size // infer_tensor_model_parallel_size num_tensor_model_parallel_groups = world_size // infer_tensor_model_parallel_size
num_tensor_model_parallel_groups_per_train_tp = train_tensor_model_parallel_size // infer_tensor_model_parallel_size
group_ranks = [] group_ranks = []
for i in range(num_tensor_model_parallel_groups // num_tensor_model_parallel_groups_per_train_tp): for i in range(num_tensor_model_parallel_groups):
start = train_tensor_model_parallel_size * i ranks = list(range(i * infer_tensor_model_parallel_size, (i + 1) * infer_tensor_model_parallel_size))
end = train_tensor_model_parallel_size * (i + 1) group_ranks.append(ranks)
for j in range(num_tensor_model_parallel_groups_per_train_tp):
ranks = list(range(start + j, end, num_tensor_model_parallel_groups_per_train_tp))
group_ranks.append(ranks)
return group_ranks return group_ranks
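A small worked example of the new contiguous inference-TP grouping, with hypothetical sizes (8 ranks, infer TP = 2):

```python
# Worked example (hypothetical sizes) of the contiguous TP grouping above.
world_size = 8
infer_tensor_model_parallel_size = 2

num_tensor_model_parallel_groups = world_size // infer_tensor_model_parallel_size
group_ranks = [
    list(range(i * infer_tensor_model_parallel_size,
               (i + 1) * infer_tensor_model_parallel_size))
    for i in range(num_tensor_model_parallel_groups)
]
print(group_ranks)  # [[0, 1], [2, 3], [4, 5], [6, 7]]
```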
@ -201,9 +198,11 @@ def initialize_model_parallel_for_vllm(
_TP_GROUP_RANKS = tp_group_ranks _TP_GROUP_RANKS = tp_group_ranks
return tp_group_ranks return tp_group_ranks
tp_group_ranks = get_tp_group_ranks()
logger.info(f"TP rank: {tp_group_ranks}")
_TP = init_model_parallel_group( _TP = init_model_parallel_group(
group_ranks=get_tp_group_ranks(), group_ranks=tp_group_ranks,
local_rank=get_world_group().local_rank, local_rank=get_world_group().local_rank,
backend=backend, backend=backend,
use_message_queue_broadcaster=True, use_message_queue_broadcaster=True,
@ -218,16 +217,134 @@ def initialize_model_parallel_for_vllm(
ranks = list(range(i, world_size, num_pipeline_model_parallel_groups)) ranks = list(range(i, world_size, num_pipeline_model_parallel_groups))
group_ranks.append(ranks) group_ranks.append(ranks)
# pipeline parallel does not need custom allreduce # pipeline parallel does not need custom allreduce
logger.info(f"PP rank: {group_ranks}")
_PP = init_model_parallel_group( _PP = init_model_parallel_group(
group_ranks, get_world_group().local_rank, backend, group_ranks, get_world_group().local_rank, backend,
) )
ps._PP = _PP # for verl ps._PP = _PP # for verl
data_parallel_size = 1
from vllm.config import get_current_vllm_config
config = get_current_vllm_config()
if config is not None:
data_parallel_size = config.parallel_config.data_parallel_size
num_expert_parallel_groups: int = infer_expert_tensor_parallel_size
num_expert_tensor_parallel_groups: int = world_size // infer_expert_tensor_parallel_size
num_rank_per_process = world_size // num_process
all_ranks = list(range(world_size))
global _EP
assert _EP is None, ("expert parallel group is already initialized")
group_ranks = []
if rebulid_EP_group:
# rebuild the expert-parallel groups
tensor_model_parallel_size = train_tensor_model_parallel_size
context_parallel_size = train_context_model_parallel_size
expert_model_parallel_size = train_expert_model_parallel_size
train_data_parallel_size = world_size // tensor_model_parallel_size // train_pipeline_model_parallel_size
tensor_and_data_group_size_with_cp: int = tensor_model_parallel_size * train_data_parallel_size * context_parallel_size
num_tensor_and_data_groups_with_cp: int = world_size // tensor_and_data_group_size_with_cp
num_expert_groups: int = train_data_parallel_size * context_parallel_size // expert_model_parallel_size
tensor_and_expert_group_size = tensor_model_parallel_size * expert_model_parallel_size
all_tensor_and_expert_group_ranks = []
for i in range(num_tensor_and_data_groups_with_cp):
for j in range(num_expert_groups):
start_rank = i * tensor_and_data_group_size_with_cp + j * tensor_and_expert_group_size
end_rank = i * tensor_and_data_group_size_with_cp + (j + 1) * tensor_and_expert_group_size
ranks = range(start_rank, end_rank)
all_tensor_and_expert_group_ranks.append(list(ranks))
train_all_tensor_and_expert_group_ranks_tensor = torch.tensor(all_tensor_and_expert_group_ranks)
# transpose the training-time EP groups into the inference EP layout
infer_actual_expert_model_parallel_size = infer_tensor_model_parallel_size * infer_expert_model_parallel_size
experts_memory_expend_N = infer_actual_expert_model_parallel_size // tensor_and_expert_group_size
ep_group_num = world_size // tensor_and_expert_group_size
group_ranks = []
for i in range(0, ep_group_num, experts_memory_expend_N):
per_ep_group = train_all_tensor_and_expert_group_ranks_tensor[i:i + experts_memory_expend_N]
per_ep_group_T = per_ep_group.T
ranks = per_ep_group_T.reshape(-1).tolist()
group_ranks.append(ranks)
logger.info(f"EP rank: {group_ranks}")
else:
# keep the original rank ordering (no regrouping)
group_ranks = []
tensor_model_parallel_size = infer_tensor_model_parallel_size
context_parallel_size = 1
expert_model_parallel_size = infer_expert_model_parallel_size
infer_data_parallel_size = world_size // tensor_model_parallel_size // infer_pipeline_model_parallel_size
tensor_and_data_group_size_with_cp: int = tensor_model_parallel_size * infer_data_parallel_size * context_parallel_size
num_tensor_and_data_groups_with_cp: int = world_size // tensor_and_data_group_size_with_cp
num_expert_groups: int = infer_data_parallel_size * context_parallel_size // expert_model_parallel_size
tensor_and_expert_group_size = tensor_model_parallel_size * expert_model_parallel_size
group_ranks = []
for i in range(num_tensor_and_data_groups_with_cp):
for j in range(num_expert_groups):
start_rank = i * tensor_and_data_group_size_with_cp + j * tensor_and_expert_group_size
end_rank = i * tensor_and_data_group_size_with_cp + (j + 1) * tensor_and_expert_group_size
ranks = range(start_rank, end_rank)
group_ranks.append(list(ranks))
logger.info(f"EP rank: {group_ranks}")
ascend_ps._EP = init_model_parallel_group(group_ranks,
get_world_group().local_rank,
backend,
group_name="ep")
global _ETP
assert _ETP is None, (
"expert tensor parallel group is already initialized")
group_ranks = []
for i in range(num_expert_tensor_parallel_groups):
ranks = list(range(i * infer_expert_tensor_parallel_size,
(i + 1) * infer_expert_tensor_parallel_size))
group_ranks.append(ranks)
logger.info(f"ETP rank: {group_ranks}")
ascend_ps._ETP = init_model_parallel_group(group_ranks,
get_world_group().local_rank,
backend,
group_name="etp")
if data_parallel_size > 1:
global _DP
assert _DP is None, ("data parallel group is already initialized")
dp_group_ranks = torch.tensor(tp_group_ranks).transpose(0, 1).reshape(-1, data_parallel_size).unbind(0)
group_ranks = [x.tolist() for x in dp_group_ranks]
logger.info(f"DP rank: {group_ranks}")
ps._DP = init_model_parallel_group(group_ranks,
get_world_group().local_rank,
backend,
group_name="dp")
os.environ["VLLM_DP_RANK"] = str(ps._DP.rank_in_group)
envs.VLLM_DP_RANK = int(os.environ["VLLM_DP_RANK"])
ip_list = get_cluster_info()
for index, group_rank in enumerate(group_ranks):
if torch.distributed.get_rank() in group_rank:
os.environ["VLLM_DP_MASTER_PORT"] = str(
int(os.environ.get("MASTER_PORT")) + 1 + index)
os.environ["VLLM_DP_MASTER_IP"] = ip_list[group_rank[0]]
envs.VLLM_DP_MASTER_IP = os.environ["VLLM_DP_MASTER_IP"]
envs.VLLM_DP_MASTER_PORT = int(os.environ["VLLM_DP_MASTER_PORT"])
os.environ["VLLM_PORT"] = os.environ["VLLM_DP_MASTER_PORT"]
envs.VLLM_PORT = envs.VLLM_DP_MASTER_PORT
logger.info(f"rank: {torch.distributed.get_rank()}>>>>>>VLLM_DP_MASTER_IP: {envs.VLLM_DP_MASTER_IP}, VLLM_DP_MASTER_PORT: {envs.VLLM_DP_MASTER_PORT}")
def initialize_model_parallel( def initialize_model_parallel(
tensor_model_parallel_size: int = 1, tensor_model_parallel_size: int = 1,
pipeline_model_parallel_size: int = 1, pipeline_model_parallel_size: int = 1,
backend: Optional[str] = None, backend: Optional[str] = None,
) -> None: ) -> None:
""" """
NOTE: This method is a hack from the open-sourced version without NOTE: This method is a hack from the open-sourced version without
@ -260,8 +377,6 @@ def initialize_model_parallel(
world_size: int = torch.distributed.get_world_size() world_size: int = torch.distributed.get_world_size()
backend = backend or torch.distributed.get_backend(ps.get_world_group().device_group) backend = backend or torch.distributed.get_backend(ps.get_world_group().device_group)
num_tensor_model_parallel_groups: int = world_size // tensor_model_parallel_size num_tensor_model_parallel_groups: int = world_size // tensor_model_parallel_size
global _TP global _TP
if _TP is not None: if _TP is not None:
@ -295,3 +410,60 @@ def initialize_model_parallel(
ps._PP = _PP # for verl ps._PP = _PP # for verl
def get_cluster_info():
# make sure the distributed environment is initialized
if not dist.is_initialized():
raise RuntimeError("Distributed environment not initialized")
world_size = dist.get_world_size()
# get the IP address of the current node
ip_address = _get_current_node_ip()
# gather the IP addresses of all ranks
ip_list = [None] * world_size
dist.all_gather_object(ip_list, ip_address)
return ip_list
def _get_current_node_ip() -> str:
try:
# create a UDP socket (used only to query interface information)
with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
# connect to an external address (no real traffic is sent)
s.connect(("8.8.8.8", 80)) # Google DNS server
local_ip = s.getsockname()[0]
except Exception:
local_ip = _get_ip_by_ifname()
if not local_ip:
# if that fails, fall back to iterating over interfaces
local_ip = "127.0.0.1"
hostname = socket.gethostname()
for addr in socket.getaddrinfo(hostname, None):
ip = addr[4][0]
if not ip.startswith("::"):
local_ip = ip
break
return local_ip
def _get_ip_by_ifname():
"""
Get the IPv4 address of a network interface by name (e.g. eth0, en0).
Returns the IP string, or None on failure.
"""
try:
# run the ifconfig command and capture its output
ifname = os.environ.get("HCCL_SOCKET_IFNAME", 0)
if ifname:
output = subprocess.check_output(["ifconfig", ifname], stderr=subprocess.STDOUT).decode()
# regex-match IPv4 addresses (excluding 127.0.0.1)
matches = re.findall(r'inet (?:addr:)?((?:\d{1,3}\.){3}\d{1,3})', output)
for ip in matches:
if ip != "127.0.0.1":
return ip
return None
except subprocess.CalledProcessError:
return None

View File

@ -9,10 +9,10 @@ import gc
import ray import ray
import torch import torch
import torch.distributed import torch.distributed
import torch_npu
from torch_npu.contrib import transfer_to_npu
from torch.nn.utils.rnn import pad_sequence from torch.nn.utils.rnn import pad_sequence
from transformers import AutoConfig from transformers import AutoConfig
from vllm import LLM, SamplingParams
from mindspeed_rl.utils.utils import mstx_timer_decorator
def dummy_compile(*compile_args, **compile_kwargs): def dummy_compile(*compile_args, **compile_kwargs):
@ -23,14 +23,14 @@ def dummy_compile(*compile_args, **compile_kwargs):
return wrapper return wrapper
return decorate return decorate
torch.jit.script = dummy_compile
torch.compile = dummy_compile torch.compile = dummy_compile
torch.jit.script = dummy_compile
from vllm import LLM, SamplingParams
from vllm.v1.core.kv_cache_utils import get_kv_cache_config, unify_kv_cache_configs
from mindspeed_rl.utils.loggers import Loggers from mindspeed_rl.utils.loggers import Loggers
from mindspeed_rl.models.base.base_inference_engine import BaseInferEngine from mindspeed_rl.models.base.base_inference_engine import BaseInferEngine
from mindspeed_rl.config_cls.megatron_config import MegatronConfig
from mindspeed_rl.models.rollout.vllm_adapter.vllm_parallel_state import initialize_parallel_state from mindspeed_rl.models.rollout.vllm_adapter.vllm_parallel_state import initialize_parallel_state
from mindspeed_rl.models.rollout.vllm_adapter.megatron_weight_loaders import ( from mindspeed_rl.models.rollout.vllm_adapter.megatron_weight_loaders import (
load_megatron_weights, load_megatron_weights,
@ -49,11 +49,12 @@ class VLLMInferEngine(BaseInferEngine):
train_tensor_parallel_size: int, train_tensor_parallel_size: int,
train_pipeline_parallel_size: int, train_pipeline_parallel_size: int,
train_expert_parallel_size: int, train_expert_parallel_size: int,
train_context_parallel_size: int,
infer_tensor_parallel_size: int, infer_tensor_parallel_size: int,
infer_pipeline_parallel_size: int, infer_pipeline_parallel_size: int,
infer_expert_parallel_size: int, infer_expert_parallel_size: int,
megatron_config: MegatronConfig,
sampling_config: dict, sampling_config: dict,
infer_expert_tensor_parallel_size: int = 1,
prompt_type: str = None, prompt_type: str = None,
prompt_type_path: str = None, prompt_type_path: str = None,
enable_prefix_caching: bool = False, enable_prefix_caching: bool = False,
@ -64,6 +65,7 @@ class VLLMInferEngine(BaseInferEngine):
gpu_memory_utilization: float = 0.5, gpu_memory_utilization: float = 0.5,
trust_remote_code: bool = True, trust_remote_code: bool = True,
load_format: str = "megatron", load_format: str = "megatron",
enforce_eager: bool = False,
**kwargs **kwargs
): ):
""" """
@ -74,10 +76,12 @@ class VLLMInferEngine(BaseInferEngine):
train_tensor_parallel_size (int): Tensor parallel size during training. train_tensor_parallel_size (int): Tensor parallel size during training.
train_pipeline_parallel_size (int): Pipeline parallel size during training. train_pipeline_parallel_size (int): Pipeline parallel size during training.
train_expert_parallel_size (int): Expert parallel size during training. train_expert_parallel_size (int): Expert parallel size during training.
train_context_parallel_size (int): Context parallel size during training.
infer_tensor_parallel_size (int): Tensor parallel size during inference. infer_tensor_parallel_size (int): Tensor parallel size during inference.
infer_pipeline_parallel_size (int): Pipeline parallel size during inference. infer_pipeline_parallel_size (int): Pipeline parallel size during inference.
infer_expert_parallel_size (int): Expert parallel size during inference. infer_expert_parallel_size (int): Expert parallel size during inference.
sampling_config (dict): Configuration for text generation sampling. sampling_config (dict): Configuration for text generation sampling.
infer_expert_tensor_parallel_size (int): Expert tensor parallel size during inference.
enable_prefix_caching (bool): Whether to enable prefix caching. enable_prefix_caching (bool): Whether to enable prefix caching.
num_scheduler_steps (int): Num scheduler steps. Default is 1. num_scheduler_steps (int): Num scheduler steps. Default is 1.
max_num_seqs (int): Maximum number of sequences to process simultaneously. Default is 1. max_num_seqs (int): Maximum number of sequences to process simultaneously. Default is 1.
@ -95,6 +99,7 @@ class VLLMInferEngine(BaseInferEngine):
train_tensor_parallel_size=train_tensor_parallel_size, train_tensor_parallel_size=train_tensor_parallel_size,
train_pipeline_parallel_size=train_pipeline_parallel_size, train_pipeline_parallel_size=train_pipeline_parallel_size,
train_expert_parallel_size=train_expert_parallel_size, train_expert_parallel_size=train_expert_parallel_size,
train_context_parallel_size=train_context_parallel_size,
infer_tensor_parallel_size=infer_tensor_parallel_size, infer_tensor_parallel_size=infer_tensor_parallel_size,
infer_pipeline_parallel_size=infer_pipeline_parallel_size, infer_pipeline_parallel_size=infer_pipeline_parallel_size,
infer_expert_parallel_size=infer_expert_parallel_size, infer_expert_parallel_size=infer_expert_parallel_size,
@ -105,6 +110,12 @@ class VLLMInferEngine(BaseInferEngine):
trust_remote_code=trust_remote_code trust_remote_code=trust_remote_code
) )
# Additional initialization logic for VLLMInferEngine # Additional initialization logic for VLLMInferEngine
torch.compile = dummy_compile
# vLLM Ascend must be patched in advance
from vllm_ascend.patch import platform
from vllm_ascend.patch import worker
# Initialize sampling parameters from SamplingConfig # Initialize sampling parameters from SamplingConfig
self.sampling_config = sampling_config self.sampling_config = sampling_config
try: try:
@ -112,13 +123,11 @@ class VLLMInferEngine(BaseInferEngine):
n=sampling_config.get('num_completions', 1), n=sampling_config.get('num_completions', 1),
logprobs=sampling_config.get('logprobs', 1), logprobs=sampling_config.get('logprobs', 1),
max_tokens=sampling_config.get('max_tokens', 128), max_tokens=sampling_config.get('max_tokens', 128),
best_of=sampling_config.get('best_of', 2),
top_p=sampling_config.get('top_p', 1.0), top_p=sampling_config.get('top_p', 1.0),
top_k=sampling_config.get('top_k', 50), top_k=sampling_config.get('top_k', 50),
min_p=sampling_config.get('min_p', 0.0), min_p=sampling_config.get('min_p', 0.0),
temperature=sampling_config.get('temperature', 0.2), temperature=sampling_config.get('temperature', 0.2),
detokenize=sampling_config.get('detokenize', False), detokenize=sampling_config.get('detokenize', False)
seed=sampling_config.get('seed', None)
) )
except Exception as e: except Exception as e:
raise ValueError(f"Error creating SamplingParams from dictionary") from e raise ValueError(f"Error creating SamplingParams from dictionary") from e
@ -139,7 +148,6 @@ class VLLMInferEngine(BaseInferEngine):
# Initialize parallel state if tensor parallel size is specified # Initialize parallel state if tensor parallel size is specified
if train_tensor_parallel_size is not None: if train_tensor_parallel_size is not None:
num_tp_per_train_tp = train_tensor_parallel_size // infer_tensor_parallel_size
os.environ['CUDA_TIMER_STREAM_KAFKA_ENABLE'] = '0' os.environ['CUDA_TIMER_STREAM_KAFKA_ENABLE'] = '0'
os.environ['MEGATRON_IMPORT_TIMERS'] = '0' os.environ['MEGATRON_IMPORT_TIMERS'] = '0'
initialize_parallel_state( initialize_parallel_state(
@ -147,32 +155,39 @@ class VLLMInferEngine(BaseInferEngine):
train_tensor_model_parallel_size=train_tensor_parallel_size, train_tensor_model_parallel_size=train_tensor_parallel_size,
infer_pipeline_model_parallel_size=infer_pipeline_parallel_size, infer_pipeline_model_parallel_size=infer_pipeline_parallel_size,
train_pipeline_model_parallel_size=train_pipeline_parallel_size, train_pipeline_model_parallel_size=train_pipeline_parallel_size,
train_expert_model_parallel_size=train_expert_parallel_size,
infer_expert_model_parallel_size=infer_expert_parallel_size,
train_context_model_parallel_size=train_context_parallel_size
) )
if load_format == "megatron": if load_format == "megatron":
update_megatron_weight_loader() update_megatron_weight_loader()
torch.jit.script = dummy_compile
torch.compile = dummy_compile
# Initialize the LLM engine # Initialize the LLM engine
self.llm = LLM( self.llm = LLM(
seed=1234,
model=tokenizer_name_or_path, model=tokenizer_name_or_path,
trust_remote_code=trust_remote_code, trust_remote_code=trust_remote_code,
tensor_parallel_size=infer_tensor_parallel_size, tensor_parallel_size=infer_tensor_parallel_size,
load_format="dummy" if load_format == "megatron" else "auto", load_format='dummy' if load_format == 'megatron' else load_format,
distributed_executor_backend="external_launcher", distributed_executor_backend="external_launcher",
enable_prefix_caching=enable_prefix_caching, enable_prefix_caching=enable_prefix_caching,
num_scheduler_steps=num_scheduler_steps, num_scheduler_steps=num_scheduler_steps,
dtype=dtype, dtype=dtype,
enforce_eager=False, enforce_eager=enforce_eager,
skip_tokenizer_init=False, skip_tokenizer_init=False,
gpu_memory_utilization=gpu_memory_utilization, gpu_memory_utilization=gpu_memory_utilization,
max_num_seqs=max_num_seqs, max_num_seqs=max_num_seqs,
max_model_len=max_model_len max_model_len=max_model_len,
additional_config={
'expert_tensor_parallel_size': infer_expert_tensor_parallel_size,
'enable_graph_mode': int(os.environ.get('VLLM_ENABLE_GRAPH_MODE', '0')),
'ascend_scheduler_config': {},
}
) )
self.model = self.llm.llm_engine.model_executor.driver_worker.worker.model_runner.get_model() self.model = self.llm.llm_engine.model_executor.driver_worker.worker.model_runner.get_model()
self.kv_cache_configs = None
self.cpu_model = {} self.cpu_model = {}
for name, params in self.model.named_parameters(): for name, params in self.model.named_parameters():
@ -180,14 +195,61 @@ class VLLMInferEngine(BaseInferEngine):
if load_format == "megatron": if load_format == "megatron":
self.free_cache_engine() self.free_cache_engine()
if os.environ['VLLM_USE_V1'] == '1':
self._initialize_kv_caches(self.llm.llm_engine.vllm_config)
self.offload_model_weights() self.offload_model_weights()
from vllm.config import VllmConfig
def _initialize_kv_caches(self, vllm_config: VllmConfig):
# Get all kv cache needed by the model
kv_cache_specs = self.llm.llm_engine.engine_core.engine_core.model_executor.get_kv_cache_specs()
# Profiles the peak memory usage of the model to determine how much
# memory can be allocated for kv cache.
available_gpu_memory = self.llm.llm_engine.engine_core.engine_core.model_executor.determine_available_memory()
assert len(kv_cache_specs) == len(available_gpu_memory)
# Get the kv cache tensor size
self.kv_cache_configs = [
get_kv_cache_config(vllm_config, kv_cache_spec_one_worker,
available_gpu_memory_one_worker)
for kv_cache_spec_one_worker, available_gpu_memory_one_worker in
zip(kv_cache_specs, available_gpu_memory)
]
# Since we use a shared centralized controller, we need the
# `kv_cache_config` to be consistent across all workers to make sure
# all the memory operators can be applied to all workers.
unify_kv_cache_configs(self.kv_cache_configs)
# All workers have the same kv_cache_config except layer names, so use
# an arbitrary one to initialize the scheduler.
assert all([
cfg.num_blocks == self.kv_cache_configs[0].num_blocks
for cfg in self.kv_cache_configs
])
def init_cache_engine(self): def init_cache_engine(self):
if self.llm.llm_engine.model_executor.driver_worker.worker.cache_engine is None: if os.environ['VLLM_USE_V1'] == '1':
self.llm.llm_engine.model_executor.driver_worker.worker._init_cache_engine() worker = self.llm.llm_engine.model_executor.driver_worker.worker
if not worker.model_runner.kv_caches:
# v1 uses the explicit initialization path
self.llm.llm_engine.engine_core.engine_core.model_executor.initialize_from_config(
self.kv_cache_configs)
else:
if self.llm.llm_engine.model_executor.driver_worker.worker.cache_engine is None:
self.llm.llm_engine.model_executor.driver_worker.worker._init_cache_engine()
def free_cache_engine(self): def free_cache_engine(self):
ctx = self.llm.llm_engine.model_executor.driver_worker.worker.compilation_config.static_forward_context if os.environ['VLLM_USE_V1'] == '1':
worker = self.llm.llm_engine.model_executor.driver_worker.worker
ctx = worker.model_runner.vllm_config.compilation_config.static_forward_context
else:
ctx = self.llm.llm_engine.model_executor.driver_worker.worker.compilation_config.static_forward_context
from vllm.attention import AttentionType from vllm.attention import AttentionType
layer_need_kv_cache = [] layer_need_kv_cache = []
@ -201,10 +263,14 @@ class VLLMInferEngine(BaseInferEngine):
for _ in range(pipeline_parallel_size): for _ in range(pipeline_parallel_size):
kv_cache.append(torch.tensor([])) kv_cache.append(torch.tensor([]))
ctx[layer_name].kv_cache = kv_cache ctx[layer_name].kv_cache = kv_cache
if os.environ['VLLM_USE_V1'] == '1':
worker = self.llm.llm_engine.model_executor.driver_worker.worker
self.llm.llm_engine.model_executor.driver_worker.worker.cache_engine = None # clear the cache engine
self.llm.llm_engine.model_executor.driver_worker.worker.gpu_cache = None worker.model_runner.kv_caches = []
else:
self.llm.llm_engine.model_executor.driver_worker.worker.cache_engine = None
self.llm.llm_engine.model_executor.driver_worker.worker.gpu_cache = None
if hasattr(self.model.model.layers[0].self_attn, "attn"): if hasattr(self.model.model.layers[0].self_attn, "attn"):
for i in range(self.model.model.start_layer, self.model.model.end_layer): for i in range(self.model.model.start_layer, self.model.model.end_layer):
attn_impl = self.model.model.layers[i].self_attn.attn.impl attn_impl = self.model.model.layers[i].self_attn.attn.impl
@ -212,18 +278,26 @@ class VLLMInferEngine(BaseInferEngine):
attn_impl.key_cache = None attn_impl.key_cache = None
attn_impl.value_cache = None attn_impl.value_cache = None
self.gpu_cache = None
gc.collect() gc.collect()
torch.cuda.empty_cache() torch.cuda.empty_cache()
def offload_model_weights(self): def offload_model_weights(self):
for name, params in self.model.named_parameters(): for name, params in self.model.named_parameters():
params.data = self.cpu_model[name] params.data = self.cpu_model[name]
if hasattr(self.model.model.layers[-1].self_attn, "mla_attn"):
for i in range(self.model.model.start_layer, self.model.model.end_layer):
mla = self.model.model.layers[i].self_attn.mla_attn.impl
if hasattr(mla, "w_kc"):
mla.w_kc = None
mla.w_vc = None
if hasattr(mla, "W_UV"):
mla.W_UV = None
mla.W_UK_T = None
    def sync_model_weights(self, params, load_format='megatron'):
        infer_parallel_config = InferParallelConfig(self.infer_tensor_parallel_size, self.infer_pipeline_parallel_size,
                                                    self.infer_expert_parallel_size * self.infer_tensor_parallel_size)
        load_megatron_weights(params,
                              self.model,
                              infer_parallel_config,
@ -237,9 +311,12 @@ class VLLMInferEngine(BaseInferEngine):
if hasattr(mla, "w_kc"): if hasattr(mla, "w_kc"):
mla.w_kc = None mla.w_kc = None
mla.w_vc = None mla.w_vc = None
if hasattr(mla, "W_UV"):
mla.W_UV = None
mla.W_UK_T = None
mla.process_weights_after_loading(None)
@torch.no_grad() @torch.no_grad()
@mstx_timer_decorator
def generate_sequences(self, idx_list, **kwargs): def generate_sequences(self, idx_list, **kwargs):
self.init_cache_engine() self.init_cache_engine()
with self.update_sampling_params(**kwargs): with self.update_sampling_params(**kwargs):
@ -325,4 +402,4 @@ def get_local_rank() -> int:
# Default to 0 (for testing or single-process scenarios) # Default to 0 (for testing or single-process scenarios)
logger.warning("Warning: Unable to determine local rank. Defaulting to 0.") logger.warning("Warning: Unable to determine local rank. Defaulting to 0.")
return 0 return 0
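The engine exposes the cache/weight lifecycle used during resharding; a minimal usage sketch, assuming an already constructed VLLMInferEngine `engine`, a Megatron state dict `actor_weights` and a prepared `idx_list`, and mirroring the call order of the (now removed) st tests further below:
# sketch only: offload/onload cycle around one rollout step
engine.free_cache_engine()            # release the KV cache before leaving inference mode
engine.offload_model_weights()        # swap inference weights out to their CPU copies
engine.sync_model_weights(actor_weights, load_format='megatron')   # load resharded weights
engine.init_cache_engine()            # rebuild the KV cache (v0 cache engine or v1 kv_cache_configs)
outputs = engine.generate_sequences(idx_list=idx_list)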


@ -259,8 +259,8 @@ class _HuggingFaceTokenizer(BaseTokenizer):
    def tokenize(self, text):
        return self.tokenizer(text).input_ids

    def detokenize(self, token_ids, **kwargs):
        return self.tokenizer.decode(token_ids, **kwargs)
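    # Illustrative only: the **kwargs pass-through forwards HuggingFace decode options, e.g.
    #   text = tokenizer.detokenize(token_ids, skip_special_tokens=True)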
@property @property
def eod(self): def eod(self):


@ -88,6 +88,9 @@ class ActorHybridWorkerBase(BaseWorker):
self.inference_model = self._build_rollout() self.inference_model = self._build_rollout()
self.sharding_manager = self._build_sharding_manager() self.sharding_manager = self._build_sharding_manager()
if self.generate_config.offload_train_param:
self.actor_offloader.onload_param()
self.actor_hybrid = ActorRolloutHybrid( self.actor_hybrid = ActorRolloutHybrid(
self.model, self.model,
@ -178,7 +181,7 @@ class ActorHybridWorkerBase(BaseWorker):
self.args.consumed_train_samples += self.megatron_config.global_batch_size // self.rl_config.n_samples_per_prompt self.args.consumed_train_samples += self.megatron_config.global_batch_size // self.rl_config.n_samples_per_prompt
self.num_floating_point_operations_so_far += num_floating_point_operations(self.args, self.num_floating_point_operations_so_far += num_floating_point_operations(self.args,
self.megatron_config.global_batch_size) self.megatron_config.global_batch_size)
if self.parallel_state.is_pipeline_last_stage(ignore_virtual=True) and self.parallel_state.get_tensor_model_parallel_rank() == 0:
ray.get(self.td.update_metrics.remote(value=metrics, cumulate=True)) ray.get(self.td.update_metrics.remote(value=metrics, cumulate=True))
ray.get( ray.get(
self.td.update_metrics.remote( self.td.update_metrics.remote(
@ -371,6 +374,7 @@ class ActorHybridWorkerBase(BaseWorker):
train_tensor_parallel_size=self.megatron_config.tensor_model_parallel_size, train_tensor_parallel_size=self.megatron_config.tensor_model_parallel_size,
train_pipeline_parallel_size=self.megatron_config.pipeline_model_parallel_size, train_pipeline_parallel_size=self.megatron_config.pipeline_model_parallel_size,
train_expert_parallel_size=self.megatron_config.expert_model_parallel_size, train_expert_parallel_size=self.megatron_config.expert_model_parallel_size,
train_context_parallel_size=self.megatron_config.context_parallel_size,
infer_tensor_parallel_size=self.generate_config.infer_tensor_parallel_size, infer_tensor_parallel_size=self.generate_config.infer_tensor_parallel_size,
infer_pipeline_parallel_size=self.generate_config.infer_pipeline_parallel_size, infer_pipeline_parallel_size=self.generate_config.infer_pipeline_parallel_size,
infer_expert_parallel_size=self.generate_config.infer_expert_parallel_size, infer_expert_parallel_size=self.generate_config.infer_expert_parallel_size,
@ -382,9 +386,9 @@ class ActorHybridWorkerBase(BaseWorker):
max_model_len=self.generate_config.max_model_len, max_model_len=self.generate_config.max_model_len,
dtype=self.generate_config.dtype, dtype=self.generate_config.dtype,
gpu_memory_utilization=self.generate_config.gpu_memory_utilization, gpu_memory_utilization=self.generate_config.gpu_memory_utilization,
trust_remote_code=self.generate_config.trust_remote_code,
enforce_eager=self.generate_config.enforce_eager,
) )
return rollout return rollout
def _build_sharding_manager(self): def _build_sharding_manager(self):
@ -404,7 +408,8 @@ class ActorHybridWorkerBase(BaseWorker):
grad_offload=self.generate_config.offload_train_grad, grad_offload=self.generate_config.offload_train_grad,
train_param_offload=self.generate_config.offload_train_param, train_param_offload=self.generate_config.offload_train_param,
enable_validate=self.rl_config.enable_sharding_validate, enable_validate=self.rl_config.enable_sharding_validate,
megatron_offloader=self.actor_offloader,
noop_layers=self.megatron_config.noop_layers
) )
return sharding_manager return sharding_manager


@ -21,20 +21,27 @@ Manager used to shard weight and offload/onload optimizer from training stage to
from itertools import chain
from collections import defaultdict
import os
import torch
import torch.distributed as dist
import vllm.distributed.parallel_state as ps
from mindspeed_rl.utils.loggers import Loggers
from mindspeed_rl.workers.resharding.vllm_weight_container import MegatronStyleVllmWeightContainer
from mindspeed_rl.workers.resharding.weight_adaptor import get_weight_adaptor
from mindspeed_rl.utils.utils import mstx_timer_decorator
logger = Loggers(
name="vllm_engine_inference",
)
class MegatronOffLoader: class MegatronOffLoader:
def __init__(self, megatron_model=None, optimizer=None, wrap_with_ddp=True): def __init__(self, megatron_model=None, optimizer=None, wrap_with_ddp=True):
self.optimizer = optimizer self.optimizer = optimizer
self.model = megatron_model self.model = megatron_model
self.wrap_with_ddp = wrap_with_ddp self.wrap_with_ddp = wrap_with_ddp
self.tensor_to_cpu_states_map = dict() self.tensor_to_cpu_states_map = dict()
@mstx_timer_decorator @mstx_timer_decorator
@ -51,18 +58,29 @@ class MegatronOffLoader:
    @mstx_timer_decorator
    def offload_optimizer(self):
        if hasattr(self.optimizer, "chained_optimizers"):
            optimizers = self.optimizer.chained_optimizers
        else:
            optimizers = [self.optimizer]
        for optimizer in optimizers:
            for param_group in optimizer.optimizer.param_groups:
                for param in param_group['params']:
                    param.data = param.data.to("cpu", non_blocking=False)
            optimizer.optimizer.state = self._move_to_device(optimizer.optimizer.state,
                                                             "cpu")

    @mstx_timer_decorator
    def onload_optimizer(self):
        if hasattr(self.optimizer, "chained_optimizers"):
            optimizers = self.optimizer.chained_optimizers
        else:
            optimizers = [self.optimizer]
        for optimizer in optimizers:
            for param_group in optimizer.optimizer.param_groups:
                for param in param_group['params']:
                    param.data = param.data.to(torch.cuda.current_device(), non_blocking=False)
            optimizer.optimizer.state = self._move_to_device(optimizer.optimizer.state,
                                                             torch.cuda.current_device())
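    # Note (illustrative, not part of the change): the hasattr check above covers both a plain
    # Megatron optimizer and a ChainedOptimizer that exposes .chained_optimizers, so offload and
    # onload can walk one flat `optimizers` list in either case.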
@mstx_timer_decorator @mstx_timer_decorator
def _move_to_device(self, data, device): def _move_to_device(self, data, device):
@ -133,7 +151,8 @@ class MegatronShardingManager:
num_layer_list=None, num_layer_list=None,
moe_tp_extend_ep=None, moe_tp_extend_ep=None,
parallel_state=None, parallel_state=None,
megatron_offloader=None,
noop_layers=None
): ):
"""Megatron Sharding Manager initialization. """Megatron Sharding Manager initialization.
@ -169,13 +188,13 @@ class MegatronShardingManager:
moe_tp_extend_ep=moe_tp_extend_ep, moe_tp_extend_ep=moe_tp_extend_ep,
parallel_state=parallel_state, parallel_state=parallel_state,
weight_adaptor=self.weight_adaptor, weight_adaptor=self.weight_adaptor,
enable_validate=enable_validate,
noop_layers=noop_layers)
self.optimizer_offload = optimizer_offload self.optimizer_offload = optimizer_offload
self.grad_offload = grad_offload self.grad_offload = grad_offload
self.train_param_offload = train_param_offload self.train_param_offload = train_param_offload
self.enable_validate = enable_validate self.enable_validate = enable_validate
self.use_distributed_optimizer = self.optimizer.config.use_distributed_optimizer
self.inference_engine.offload_model_weights() self.inference_engine.offload_model_weights()
self.megatron_offloader = megatron_offloader self.megatron_offloader = megatron_offloader
@ -206,8 +225,6 @@ class MegatronShardingManager:
3. do resharding 3. do resharding
4. offload training param 4. offload training param
""" """
if self.train_param_offload:
self.megatron_offloader.onload_param()
self.onload_infer_params() self.onload_infer_params()
@ -215,9 +232,9 @@ class MegatronShardingManager:
if self.train_param_offload: if self.train_param_offload:
self.megatron_offloader.offload_param() self.megatron_offloader.offload_param()
self.inference_engine.sync_model_weights(infer_params, load_format='megatron') self.inference_engine.sync_model_weights(infer_params, load_format='megatron')
@mstx_timer_decorator @mstx_timer_decorator
def exit_infer_mode(self): def exit_infer_mode(self):
""" """


@ -71,7 +71,7 @@ class MemoryBuffer:
if param_name not in self.tensor_indices: if param_name not in self.tensor_indices:
raise KeyError(f"Parameter {param_name} not found in the buffer.") raise KeyError(f"Parameter {param_name} not found in the buffer.")
start_index, shape = self.tensor_indices[param_name]  # tensor_indices maps weight_name -> (start_index, shape)
return self.get(shape, start_index) return self.get(shape, start_index)
@ -82,6 +82,15 @@ def calc_padded_numel(shape: torch.Size, dtype: torch.dtype):
return (numel + align_numel - 1) // align_numel * align_numel return (numel + align_numel - 1) // align_numel * align_numel
# Build the EP-expanded buffer: construct an experts_weight_buffer_meta that holds only the expert weights
def get_weight_buffer_meta_from_buffer(weight_buffer_meta) -> Dict[str, Dict]:
experts_weight_buffer_meta = {}
for name, meta_info in sorted(weight_buffer_meta.items()):
if "mlp.experts" in name:
experts_weight_buffer_meta[name] = meta_info
return experts_weight_buffer_meta
def build_memory_buffer(weight_buffer_meta: Dict[str, Dict]) -> Dict[torch.dtype, MemoryBuffer]: def build_memory_buffer(weight_buffer_meta: Dict[str, Dict]) -> Dict[torch.dtype, MemoryBuffer]:
"""Build the memory buffer given weight_buffer_meta """Build the memory buffer given weight_buffer_meta
@ -123,8 +132,61 @@ def build_memory_buffer(weight_buffer_meta: Dict[str, Dict]) -> Dict[torch.dtype
return memory_buffers return memory_buffers
def build_experts_memory_buffer(experts_weight_buffer_meta: Dict[str, Dict], experts_memory_expend_N) -> Dict[torch.dtype, MemoryBuffer]:
    """Build the experts memory buffers given experts_weight_buffer_meta.
    Args:
        experts_weight_buffer_meta: mapping from each expert weight name to a dict with the tensor's shape and dtype
        experts_memory_expend_N: expansion factor (inference EP size // training EP size); each expert tensor is
            allocated N times so one buffer can hold the slices for all paired ranks
    Returns: one large memory buffer per dtype that can hold all the expanded expert tensors
    """
experts_memory_buffers = {}
total_numel_map = {} # map from dtype to the total numel
for _, meta_info in sorted(experts_weight_buffer_meta.items()):
shape = meta_info['shape']
shape = torch.Size([experts_memory_expend_N, shape[0], shape[1], shape[2]])
dtype = meta_info['dtype']
if not isinstance(shape, torch.Size):
raise TypeError("Shape must be an instance of torch.Size")
if not isinstance(dtype, torch.dtype):
raise TypeError("dtype must be an instance of torch.dtype")
if dtype not in total_numel_map:
total_numel_map[dtype] = 0
tmp_numel = calc_padded_numel(shape, dtype)
total_numel_map[dtype] += tmp_numel
for dtype, total_numel in total_numel_map.items():
# Create a buffer for each dtype with the total numel
experts_memory_buffers[dtype] = MemoryBuffer(total_numel, total_numel, dtype)
# Now, insert each tensor's index and shape for later retrieval by name
current_index_map = {} # This keeps track of the current memory index for each dtype
for name, meta_info in sorted(experts_weight_buffer_meta.items()):
shape = meta_info['shape']
shape = torch.Size([experts_memory_expend_N, shape[0], shape[1], shape[2]])
dtype = meta_info['dtype']
buffer = experts_memory_buffers[dtype]
tensor_size = calc_padded_numel(shape, dtype)
start_index = current_index_map.get(dtype, 0)
current_index_map[dtype] = start_index + tensor_size
buffer.tensor_indices[name] = (start_index, shape)
return experts_memory_buffers
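# Illustrative sizing example with assumed numbers: for a GroupedMLP expert weight of shape
# (num_local_experts=4, 4096, 1408) in bf16 and experts_memory_expend_N = infer_EP // train_EP = 2,
# the entry above is registered as torch.Size([2, 4, 4096, 1408]); calc_padded_numel then rounds
# its numel up to the alignment boundary before it is added to the per-dtype buffer size.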
def build_model_weight_buffer(model: nn.Module, names_per_pp: List[str], get_weight_buffer_meta):
    combined_names_per_pp = [[] for _ in names_per_pp]
for pp_rank, vpp_stages in enumerate(names_per_pp):
for weight_names_per_stage in vpp_stages:
combined_names_per_pp[pp_rank].extend(weight_names_per_stage)
memory_buffers = [ModelWeightBuffer(model, weight_names, get_weight_buffer_meta) for weight_names in combined_names_per_pp]
return memory_buffers return memory_buffers
@ -139,7 +201,7 @@ class ModelWeightBuffer:
self.weight_buffer_meta = self.get_weight_buffer_meta(self.model, weight_names) self.weight_buffer_meta = self.get_weight_buffer_meta(self.model, weight_names)
self.weight_names = list(self.weight_buffer_meta.keys()) self.weight_names = list(self.weight_buffer_meta.keys())
self.memory_buffers = None self.memory_buffers = None
# self.memory_buffers = build_memory_buffer(self.weight_buffer_meta)
def __getitem__(self, weight_name: str) -> torch.Tensor: def __getitem__(self, weight_name: str) -> torch.Tensor:
return self.get_weight_by_name(weight_name) return self.get_weight_by_name(weight_name)


@ -25,12 +25,16 @@ import torch.distributed as dist
import numpy as np import numpy as np
from torch.distributed import new_group from torch.distributed import new_group
import vllm.distributed.parallel_state as ps
from mindspeed_rl.workers.resharding.memory_buffer import build_model_weight_buffer, calc_padded_numel
import mindspeed_rl.workers.resharding.utils import mindspeed_rl.workers.resharding.utils
from mindspeed_rl.workers.resharding.utils import get_tensor_parallel_partition_dim, tp_md5_validate, \ from mindspeed_rl.workers.resharding.utils import get_tensor_parallel_partition_dim, tp_md5_validate, \
update_md5_by_rank, compute_md5, validate_md5, _build_infer_param_dict, get_tp_allgather_group, \ update_md5_by_rank, compute_md5, validate_md5, _build_infer_param_dict, get_tp_allgather_group, \
get_tp_allgather_world_size, is_tensor_parallel_param, get_tp_group, is_fake_tp_param get_tp_allgather_world_size, is_tensor_parallel_param, get_tp_group, is_fake_tp_param
from mindspeed_rl.utils.loggers import Loggers
logger = Loggers(__name__)
class MegatronStyleVllmWeightContainer: class MegatronStyleVllmWeightContainer:
@ -42,7 +46,8 @@ class MegatronStyleVllmWeightContainer:
moe_tp_extend_ep=False, moe_tp_extend_ep=False,
parallel_state=None, parallel_state=None,
weight_adaptor=None, weight_adaptor=None,
enable_validate=False,
noop_layers=None) -> None:
""" Megatron style vllm weight container. """ Megatron style vllm weight container.
Arguments: Arguments:
@ -64,16 +69,26 @@ class MegatronStyleVllmWeightContainer:
self.megatron_model = megatron_model self.megatron_model = megatron_model
self.parallel_state = parallel_state self.parallel_state = parallel_state
self.weight_adaptor = weight_adaptor self.weight_adaptor = weight_adaptor
self._num_hidden_layers = self.model_config.num_hidden_layers  # HF layer count, read from config.json under the tokenizer path
self._noop_layers = None
if noop_layers is not None:
self._noop_layers = [int(layer_idx) for layer_idx in noop_layers.split(',')]
self._num_hidden_layers += len(self._noop_layers)
# pp configs # pp configs
self._pp_rank = self.parallel_state.get_pipeline_model_parallel_rank() self._pp_rank = self.parallel_state.get_pipeline_model_parallel_rank()
self._pp_group = self.parallel_state.get_pipeline_model_parallel_group() self._pp_group = self.parallel_state.get_pipeline_model_parallel_group()
self._pp_size = self.parallel_state.get_pipeline_model_parallel_world_size() self._pp_size = self.parallel_state.get_pipeline_model_parallel_world_size()
self._world_size = dist.get_world_size()
self.pp_group_size = self._world_size // self._pp_size
## vpp
self._num_layer_list = self._build_num_layer_list(num_layer_list) self._num_layer_list = self._build_num_layer_list(num_layer_list)
self._vpp_rank = self.parallel_state._VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK if self.parallel_state._VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK else 0
self._vpp_size = self.parallel_state._VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE if self.parallel_state._VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE else 1
self._vpp_layer_list = self._build_vpp_layer_list(self._num_layer_list)
## _noop_layers
self._global2local_map = self._build_global2local_map(self._vpp_layer_list, self._vpp_size, self._noop_layers) if self._noop_layers is not None else None
# tp configs # tp configs
self._tp_size = self.parallel_state.get_tensor_model_parallel_world_size() self._tp_size = self.parallel_state.get_tensor_model_parallel_world_size()
self._tp_group = self.parallel_state.get_tensor_model_parallel_group() self._tp_group = self.parallel_state.get_tensor_model_parallel_group()
@ -97,7 +112,11 @@ class MegatronStyleVllmWeightContainer:
self._infer_ep_size = infer_expert_parallel_size self._infer_ep_size = infer_expert_parallel_size
self.moe_tp_extend_ep = moe_tp_extend_ep self.moe_tp_extend_ep = moe_tp_extend_ep
# TODO: infer_expert_tensor_parallel_size and num_process are fixed for now.
self.infer_expert_tensor_parallel_size = 1
self.num_process = 1
self._infer_ep_size = self._infer_ep_size * self._infer_tp_size
self.experts_memory_expend_N = self._infer_ep_size // self._ep_size
# validate parallel configs # validate parallel configs
self._validate_parallel_config() self._validate_parallel_config()
@ -116,10 +135,9 @@ class MegatronStyleVllmWeightContainer:
def _validate_parallel_config(self): def _validate_parallel_config(self):
if self._infer_pp_size != 1: if self._infer_pp_size != 1:
raise ValueError("infer_pp_size != 1 not supported yet") raise ValueError("infer_pp_size != 1 not supported yet")
        if self._infer_ep_size % self._ep_size != 0:
            raise ValueError("The training expert parallel size must divide the inference expert parallel size.")
if self._ep_size > 1 and not self.moe_tp_extend_ep: if self._ep_size > 1 and not self.moe_tp_extend_ep:
raise ValueError("To enable training EP, you need to enable moe_tp_extend_ep and use GroupedMLP.") raise ValueError("To enable training EP, you need to enable moe_tp_extend_ep and use GroupedMLP.")
if self._pp_size < self._infer_pp_size: if self._pp_size < self._infer_pp_size:
@ -149,6 +167,12 @@ class MegatronStyleVllmWeightContainer:
self._update_weight_buffers_intra_pp() self._update_weight_buffers_intra_pp()
self._update_weight_buffers_inter_pp() self._update_weight_buffers_inter_pp()
# Preconditions for running _update_weight_buffers_ep and _send_receive_experts
if(self.moe_tp_extend_ep and self._infer_ep_size >= self._ep_size):
self._update_weight_buffers_ep()
self._send_receive_experts()
params = self._get_all_params() params = self._get_all_params()
params = _build_infer_param_dict(params=params) params = _build_infer_param_dict(params=params)
@ -161,27 +185,55 @@ class MegatronStyleVllmWeightContainer:
raise ValueError("num_layers % pp_size == 0, please specify num_layer_list") raise ValueError("num_layers % pp_size == 0, please specify num_layer_list")
return [self._num_hidden_layers // self._pp_size for _ in range(self._pp_size)] return [self._num_hidden_layers // self._pp_size for _ in range(self._pp_size)]
def _build_vpp_layer_list(self, num_layer_list):
if self._vpp_size <= 1:
return num_layer_list
for layers_in_pp_rank in num_layer_list:
if layers_in_pp_rank % self._vpp_size != 0:
raise ValueError("num_layers_per_pp % vpp_size != 0, please specify pp_size and vpp_size")
return [int(layers_in_pp_rank / self._vpp_size) for layers_in_pp_rank in num_layer_list]
def _build_global2local_map(self, layer_list, vpp_size, noop_layers):
stage_layers_num = sum(layer_list)
glb2local_map = []
for vpp_rank in range(vpp_size):
start_layer = vpp_rank * stage_layers_num
for _, layers_in_vpp_rank in enumerate(layer_list):
layer_idx_list = [
layer_idx for layer_idx in range(start_layer, start_layer + layers_in_vpp_rank)
if layer_idx not in noop_layers
]
glb2local_map += [layer_idx % layers_in_vpp_rank for layer_idx in layer_idx_list]
start_layer += layers_in_vpp_rank
return glb2local_map
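# Illustrative only, with assumed numbers: HF num_hidden_layers = 6, noop_layers = "6,7" (so the
# padded Megatron model has 8 layers), pp_size = 2 and vpp_size = 2, giving _num_layer_list = [4, 4]
# and _vpp_layer_list = [2, 2]. The map is built per vpp round:
#   vpp 0: global layers 0-1 (pp0) and 2-3 (pp1)            -> local indices [0, 1, 0, 1]
#   vpp 1: global layers 4-5 (pp0) and 6-7 (pp1, both noop)  -> [0, 1]
# so _global2local_map == [0, 1, 0, 1, 0, 1]; vLLM's real layer k maps to the local Megatron layer
# index _global2local_map[k] inside its owning (pp, vpp) model chunk.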
def _unwrap_megatron_model(self, model): def _unwrap_megatron_model(self, model):
""" """
Remove consecutive 'module.' prefixes from the model based on the state_dict's first key. Remove consecutive 'module.' prefixes from the model based on the state_dict's first key.
This method only removes 'module.' from the beginning of the key and ignores other occurrences. This method only removes 'module.' from the beginning of the key and ignores other occurrences.
""" """
        unwraped_model = []
        for model_chunk in model:
            first_key = list(dict(model_chunk.named_parameters()).keys())[0]
            while first_key.startswith("module."):
                model_chunk = model_chunk.module
                first_key = first_key[len("module."):]  # strip one leading "module." prefix
            unwraped_model.append(model_chunk)
        return unwraped_model
def _init_weight_buffers(self): def _init_weight_buffers(self):
""" """
Build buffers from vllm state dict. Totally build train pp_size buffers, each buffer corresponds to a pack of megatron weight. Build buffers from vllm state dict. Totally build train pp_size buffers, each buffer corresponds to a pack of megatron weight.
Return a list of buffers, and a reference dict megatron_param_name->buffer. Return a list of buffers, and a reference dict megatron_param_name->buffer.
""" """
        vllm_names = list(dict(self.vllm_model.named_parameters()).keys())  # weight names inside each pp stage
        self.weight_names_per_pp = self.weight_adaptor.get_weight_names_per_pp(self._vpp_layer_list, vllm_names,
                                                                               sum(self._num_layer_list), self._vpp_size, self._noop_layers)
        self.weight_buffers = build_model_weight_buffer(self.vllm_model, self.weight_names_per_pp,
                                                        self.weight_adaptor.get_weight_buffer_meta)
def trans_ep_params_to_tp(self, megatron_param, name): def trans_ep_params_to_tp(self, megatron_param, name):
""" """
@ -264,7 +316,11 @@ class MegatronStyleVllmWeightContainer:
async_op=False async_op=False
) )
total_experts = self.num_local_experts * tp_size total_experts = self.num_local_experts * tp_size
res = torch.cat(output_tensor_list, dim=1).reshape(hidden_size, total_experts, -1)
if 'weight2' in name:
return res.permute(1, 2, 0).contiguous()
return res.permute(1, 0, 2).contiguous()
def _update_weight_buffers_intra_pp(self): def _update_weight_buffers_intra_pp(self):
""" """
@ -281,35 +337,107 @@ class MegatronStyleVllmWeightContainer:
return infer_param return infer_param
        pp_rank = self._pp_rank
        weight_names = self.weight_names_per_pp[pp_rank]
        weight_names_meta = self.weight_adaptor.convert_weight_name_meta(weight_names)
        true_megatron_model = self._unwrap_megatron_model(self.megatron_model)
        normal_layer_func = partial(self.weight_adaptor.global2local_layer, num_layer_list=self._vpp_layer_list, global2local_map=self._global2local_map)
        name_pairs = sorted(list(set([(name, vpp_rank, self.weight_adaptor.replace_name_i2t(normal_layer_func(name, vpp_rank=vpp_rank)))
                                      for vpp_rank, names_per_vpp in enumerate(weight_names_meta) for name in names_per_vpp])))

        if self.enable_validate:
            self.origin_params_for_md5 = hashlib.md5()
            self.infer_params_for_md5 = [hashlib.md5() for _ in range(get_tp_allgather_world_size())]

        # Check that the linear_fc1 and linear_fc2 weight shapes satisfy the expected relation
        # (fc1 holds the gate and up projections, so it is twice the size of fc2); other models are not supported.
        for _, vpp_rank, megatron_name in name_pairs:
            if megatron_name.endswith("linear_fc1.weight"):
                fc2_name = megatron_name.replace("linear_fc1", "linear_fc2")
                megatron_param_fc1 = dict(true_megatron_model[vpp_rank].named_parameters())[megatron_name]
                megatron_param_fc2 = dict(true_megatron_model[vpp_rank].named_parameters())[fc2_name]
                if megatron_param_fc1.shape[0] * megatron_param_fc1.shape[1] != megatron_param_fc2.shape[0] * \
                        megatron_param_fc2.shape[1] * 2:
                    raise ValueError("Only implemented for Llama model which linear_fc1 contains gate and up params.")

        weight_buffer = self.weight_buffers[pp_rank]
        megatron_params_dict = {}
        for vpp_rank in range(self._vpp_size):
            megatron_params_dict.update({vpp_rank: dict(true_megatron_model[vpp_rank].named_buffers())})
            megatron_params_dict[vpp_rank].update(true_megatron_model[vpp_rank].named_parameters())
for hf_name, vpp_rank, megatron_name in name_pairs:
if((self._infer_ep_size > 1 or self._ep_size > 1) and "mlp.experts" in megatron_name):
pass
else:
megatron_param = megatron_params_dict[vpp_rank][megatron_name]
param = _transfer_from_megatron_division(megatron_param, megatron_name)
weight_buffer.copy_by_name(hf_name, param)
# tp md5 validate # tp md5 validate
if self.enable_validate: if self.enable_validate:
tp_md5_validate(self.infer_params_for_md5, self.origin_params_for_md5, tp_md5_validate(self.infer_params_for_md5, self.origin_params_for_md5,
f"rank[{self._rank}] tp params allgather") f"rank[{self._rank}] tp params allgather")
def _update_weight_buffers_ep(self):
# Build temporary experts memory buffers
for cur_pp_rank in range(self._pp_size):
pp_rank = self._pp_rank
from mindspeed_rl.workers.resharding.memory_buffer import build_experts_memory_buffer, get_weight_buffer_meta_from_buffer
# Step 1: set up a temporary experts buffer for the current pp rank
combined_names_per_pp = []
vpp_stages = self.weight_names_per_pp[cur_pp_rank]
for weight_names_per_stage in vpp_stages:
combined_names_per_pp.extend(weight_names_per_stage)
self.weight_buffer_meta = self.weight_adaptor.get_weight_buffer_meta(self.vllm_model, combined_names_per_pp)
self.experts_weight_buffer_meta = get_weight_buffer_meta_from_buffer(self.weight_buffer_meta)
self.experts_memory_buffers = build_experts_memory_buffer(self.experts_weight_buffer_meta, self.experts_memory_expend_N)
# Step 2: copy the matching expert weights from the weight buffer into the experts buffer
if(cur_pp_rank == pp_rank):
weight_names = self.weight_names_per_pp[pp_rank]
weight_names_meta = self.weight_adaptor.convert_weight_name_meta(weight_names)
normal_layer_func = partial(self.weight_adaptor.global2local_layer, num_layer_list=self._vpp_layer_list, global2local_map=self._global2local_map)
name_pairs = sorted(list(set([(name, vpp_rank, self.weight_adaptor.replace_name_i2t(normal_layer_func(name, vpp_rank=vpp_rank)))
for vpp_rank, names_per_vpp in enumerate(weight_names_meta) for name in names_per_vpp])))
true_megatron_model = self._unwrap_megatron_model(self.megatron_model)
megatron_params_dict = {}
# collect all weights of the current pp rank
for vpp_rank in range(self._vpp_size):
megatron_params_dict.update({vpp_rank: dict(true_megatron_model[vpp_rank].named_buffers())})
megatron_params_dict[vpp_rank].update(true_megatron_model[vpp_rank].named_parameters())
for hf_name, vpp_rank, megatron_name in name_pairs:
if((self._infer_ep_size > 1 or self._ep_size > 1) and "mlp.experts" in megatron_name):
megatron_param = megatron_params_dict[vpp_rank][megatron_name]
dtype = self.experts_weight_buffer_meta[hf_name]['dtype']
self.experts_memory_buffers[dtype].copy_by_name(hf_name, megatron_param)
# Step 3: the remaining steps are shared across pp ranks
global_src = dist.get_global_rank(group=self._pp_group, group_rank=cur_pp_rank)
# broadcast the expert weights held in the experts memory buffer
for dtype, experts_memory_buffer in self.experts_memory_buffers.items():
dist.broadcast(tensor=experts_memory_buffer.data, src=global_src, group=self._pp_group, async_op=False)
pp_group_rank = self._rank // self.pp_group_size
# iterate over the tensors stored for this dtype
for name, tensor_indices_value in sorted(experts_memory_buffer.tensor_indices.items()):
shape = tensor_indices_value[1]  # shape has been expanded by a factor of N
index = pp_group_rank % self.experts_memory_expend_N
experts_tensor = experts_memory_buffer.get_by_name(name)
experts_tensor_reshape = experts_tensor.view(shape)
weight_tensor_infer = experts_tensor_reshape[index]
self.weight_buffers[cur_pp_rank].copy_by_name(name, weight_tensor_infer)
# release the experts buffers
experts_memory_buffer = None
self.experts_memory_buffers[dtype] = None
for memory_buffer in self.experts_memory_buffers.values():
memory_buffer = None
self.experts_memory_buffers = None
def _update_weight_buffers_inter_pp(self): def _update_weight_buffers_inter_pp(self):
""" """
Update weight buffers by gathering weights from other pp stage. Update weight buffers by gathering weights from other pp stage.
@ -328,6 +456,36 @@ class MegatronStyleVllmWeightContainer:
dist.broadcast(md5_tensor_src, group=self._pp_group, src=global_src, async_op=False) dist.broadcast(md5_tensor_src, group=self._pp_group, src=global_src, async_op=False)
validate_md5(md5_tensor_src, md5_tensor, f"rank[{self._rank}] pp resharding params") validate_md5(md5_tensor_src, md5_tensor, f"rank[{self._rank}] pp resharding params")
def get_expert_router(self, cur_rank, train_tp_ep_size, infer_tp_ep_size, world_size):
for tp_ep_group_id in range(world_size // infer_tp_ep_size):
tp_ep_group = [i for i in range(tp_ep_group_id * infer_tp_ep_size, (tp_ep_group_id + 1) * infer_tp_ep_size)]
if cur_rank in tp_ep_group:
self.INFER_TP_EP_GROUP = tp_ep_group
stride = infer_tp_ep_size // train_tp_ep_size
dev_array = np.array(self.INFER_TP_EP_GROUP).reshape(stride, train_tp_ep_size)
src_router = np.squeeze(dev_array.transpose().reshape(1, infer_tp_ep_size)).tolist()
src = src_router[cur_rank % infer_tp_ep_size]
dst = self.INFER_TP_EP_GROUP[src_router.index(cur_rank)]
return src, dst
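# Illustrative only, with assumed sizes: train EP(xTP) = 2, infer EP(xTP) = 4, world_size = 4.
# The group is [0, 1, 2, 3], stride = 2, dev_array = [[0, 1], [2, 3]] and src_router = [0, 2, 1, 3],
# so ranks 1 and 2 exchange their expert slices while ranks 0 and 3 keep their own (src == dst).
# _send_receive_experts below posts the matching isend/irecv pair per expert tensor from this routing.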
def _send_receive_experts(self):
cur_rank = dist.get_rank()
src_rank, dst_rank = self.get_expert_router(cur_rank, self._ep_size, self._infer_ep_size, self._world_size)
for cur_pp_rank in range(self._pp_size):
for memory_buffer in self.weight_buffers[cur_pp_rank].memory_buffers.values():
for name in sorted(memory_buffer.tensor_indices.keys()):
if "mlp.experts" in name:
# exchange the expert slice with the paired rank
tensor_to_send = memory_buffer.get_by_name(name)
tensor_to_replace = torch.empty_like(tensor_to_send)
send_op = dist.P2POp(dist.isend, tensor_to_send, dst_rank)
recv_op = dist.P2POp(dist.irecv, tensor_to_replace, src_rank)
reqs = dist.batch_isend_irecv([send_op, recv_op])
for req in reqs:
req.wait()
memory_buffer.copy_by_name(name, tensor_to_replace)
def _get_all_params(self): def _get_all_params(self):
"""Get all the parameters of the models in all pp ranks """Get all the parameters of the models in all pp ranks
@ -353,7 +511,7 @@ class MegatronStyleVllmWeightContainer:
return return
if self._tp_size % self._infer_tp_size != 0: if self._tp_size % self._infer_tp_size != 0:
raise ValueError("self._tp_size must be divisible by self._infer_tp_size") raise ValueError("self._tp_size must be divisible by self._infer_tp_size")
tp_allgather_size = self._tp_size
if mindspeed_rl.workers.resharding.utils._TP_ALLGATHER_GROUP is not None: if mindspeed_rl.workers.resharding.utils._TP_ALLGATHER_GROUP is not None:
raise RuntimeError("Group for allgather tensor model parallel weight is already initialized") raise RuntimeError("Group for allgather tensor model parallel weight is already initialized")
num_groups = self._world_size // tp_allgather_size num_groups = self._world_size // tp_allgather_size
@ -432,7 +590,7 @@ class MegatronStyleVllmWeightContainer:
2. split train_tp params into groups (size: infer_tp_size) 2. split train_tp params into groups (size: infer_tp_size)
3. return the corresponding param from group based on infer tp rank 3. return the corresponding param from group based on infer tp rank
""" """
if self._infer_tp_size <= self._tp_size or is_fake_tp_param(name, self.moe_tp_extend_ep):
return param return param
tp_group = get_tp_group() tp_group = get_tp_group()
@ -494,6 +652,9 @@ class MegatronStyleVllmWeightContainer:
torch.distributed.all_gather(infer_param, param, group=tp_allgather_group) torch.distributed.all_gather(infer_param, param, group=tp_allgather_group)
if self.enable_validate: if self.enable_validate:
update_md5_by_rank(infer_param, param, self.origin_params_for_md5, self.infer_params_for_md5) update_md5_by_rank(infer_param, param, self.origin_params_for_md5, self.infer_params_for_md5)
part_len = len(infer_param) // self._infer_tp_size
start = self._rank % self._infer_tp_size
part_param = infer_param[part_len * start:part_len * (start + 1)]
infer_param = self._default_tp_concat_fn(name, param, part_param)
return infer_param
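# Illustrative only, with assumed sizes: train TP = 4, infer TP = 2. The allgather above collects
# all 4 training shards, part_len = 4 // 2 = 2, and start = rank % 2 selects either shards [0:2]
# or [2:4]; _default_tp_concat_fn then concatenates just the two shards owned by this inference
# TP rank instead of the full training TP group.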


@ -42,6 +42,7 @@ class MegatronVLLMWeightAdaptor(BaseWeightAdaptor):
def __init__(self, model_config): def __init__(self, model_config):
super(MegatronVLLMWeightAdaptor, self).__init__() super(MegatronVLLMWeightAdaptor, self).__init__()
self.model_config = model_config self.model_config = model_config
self.meta_info = None
self.params_mapping = [ self.params_mapping = [
# (megatron core gpt model name, vllm model name) # (megatron core gpt model name, vllm model name)
("embedding.word_embeddings", "model.embed_tokens"), ("embedding.word_embeddings", "model.embed_tokens"),
@ -92,6 +93,8 @@ class MegatronVLLMWeightAdaptor(BaseWeightAdaptor):
""" """
pass pass
def convert_weight_name_meta(self, weight_names):
return weight_names
def get_weight_buffer_meta(self, model, valid_names=None): def get_weight_buffer_meta(self, model, valid_names=None):
weight_buffer_meta = {} weight_buffer_meta = {}
@ -103,11 +106,12 @@ class MegatronVLLMWeightAdaptor(BaseWeightAdaptor):
return weight_buffer_meta return weight_buffer_meta
@staticmethod @staticmethod
def global2local_layer(name, num_layer_list, vpp_rank=0, global2local_map=None):
""" """
Transform the model name in each model_chunk in global space to local space Transform the model name in each model_chunk in global space to local space
""" """
layer_name = 'layers' layer_name = 'layers'
num_layer_offset = vpp_rank * sum(num_layer_list)
if layer_name in name: # belong to an intermediate layer if layer_name in name: # belong to an intermediate layer
split_name = name.split('.') split_name = name.split('.')
@ -122,12 +126,15 @@ class MegatronVLLMWeightAdaptor(BaseWeightAdaptor):
raise ValueError(f'split_name = {split_name}') raise ValueError(f'split_name = {split_name}')
# increment layer_num_idx by layer_offset # increment layer_num_idx by layer_offset
            if global2local_map is None:
                global_idx = int(split_name[layer_num_idx]) - num_layer_offset
                for layers_in_pp in num_layer_list:
                    global_idx -= layers_in_pp
                    if global_idx < 0:
                        local_index = global_idx + layers_in_pp
                        break
            else:
                local_index = global2local_map[int(split_name[layer_num_idx])]
            split_name[layer_num_idx] = str(local_index)
name = '.'.join(split_name) # weight name in inference_tp_model name = '.'.join(split_name) # weight name in inference_tp_model
@ -135,15 +142,27 @@ class MegatronVLLMWeightAdaptor(BaseWeightAdaptor):
return name
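# Illustrative only, assuming num_layer_list = [2, 2], vpp_rank = 1 and no noop layers:
# num_layer_offset = 1 * 4 = 4, so "model.layers.5.self_attn.qkv_proj.weight" becomes global
# index 5 - 4 = 1, which falls in the first range and stays local index 1, giving
# "model.layers.1.self_attn.qkv_proj.weight"; replace_name_i2t then maps it onto the Megatron name.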
@staticmethod @staticmethod
def get_weight_names_per_pp(layer_list, vllm_names, layers_num=None, vpp_size=0, noop_layers=None):
## add protection for default kwargs
if not layers_num:
if vpp_size > 0:
raise ValueError(f"layers_num is required with vpp_size = {vpp_size}")
layers_num = sum(layer_list)
end_layer = layers_num - 1
def get_weight_names_in_range(layer_range, names: list, noop_layers=None, layer_name='layers') -> list:
""" """
Extract weights in a given range and also include the weights before and after the range as needed. Extract weights in a given range and also include the weights before and after the range as needed.
""" """
start, end = layer_range start, end = layer_range
layer_idx_list = [layer_idx for layer_idx in range(start, end + 1)]
if noop_layers:
layer_idx_list = [
layer_idx - sum(1 for i in noop_layers if i <= layer_idx) for layer_idx in layer_idx_list if
layer_idx not in noop_layers
]
last_layer_index = end_layer last_layer_index = end_layer
names_in_range = [] names_in_range = []
@ -160,7 +179,7 @@ class MegatronVLLMWeightAdaptor(BaseWeightAdaptor):
match = re.match(r'.*\.layers\.(\d+)', name) match = re.match(r'.*\.layers\.(\d+)', name)
if match: if match:
layer_num = int(match.group(1)) layer_num = int(match.group(1))
if layer_num in layer_idx_list:
names_in_range.append(name) names_in_range.append(name)
# add names after decode layers # add names after decode layers
@ -172,13 +191,18 @@ class MegatronVLLMWeightAdaptor(BaseWeightAdaptor):
break break
return names_in_range return names_in_range
        stage_layers_num = sum(layer_list)
        weight_names_per_vpp_combined = [[] for _ in layer_list]
        for vpp_rank in range(vpp_size):
            start_layer = vpp_rank * stage_layers_num
            for pp_rank, layers_in_vpp_rank in enumerate(layer_list):
                vpp_layers_range = (start_layer, start_layer + layers_in_vpp_rank - 1)
                weight_names_per_vpp = get_weight_names_in_range(vpp_layers_range, vllm_names, noop_layers)
                weight_names_per_vpp_combined[pp_rank].append(weight_names_per_vpp)
                start_layer += layers_in_vpp_rank
        return weight_names_per_vpp_combined
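        # Illustrative only, with assumed values: layer_list = [2, 2] (layers per pp rank per vpp
        # stage), layers_num = 6 real layers, vpp_size = 2, noop_layers = [2, 3] (8 padded layers).
        # Global ranges per (vpp, pp) are (0,1), (2,3), (4,5), (6,7); range (2,3) is entirely noop
        # and yields no decoder-layer names, while ranges (4,5) and (6,7) are shifted down by the
        # two preceding noop layers to vLLM layers (2,3) and (4,5). The returned [pp][vpp] list is
        # pp0 -> [[layers 0-1], [layers 2-3]], pp1 -> [[], [layers 4-5]].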
class DeepSeekMVWeightAdaptor(MegatronVLLMWeightAdaptor): class DeepSeekMVWeightAdaptor(MegatronVLLMWeightAdaptor):
@ -187,6 +211,8 @@ class DeepSeekMVWeightAdaptor(MegatronVLLMWeightAdaptor):
""" """
def __init__(self, model_config): def __init__(self, model_config):
super(DeepSeekMVWeightAdaptor, self).__init__(model_config) super(DeepSeekMVWeightAdaptor, self).__init__(model_config)
self.meta_info = {'replace': {'kv_a_proj_with_mqa': 'qkv_proj'},
'delete': ['q_a_proj']}
self.params_mapping = [ self.params_mapping = [
# (megatron core gpt model name, vllm model name) # (megatron core gpt model name, vllm model name)
("embedding.word_embeddings", "model.embed_tokens"), ("embedding.word_embeddings", "model.embed_tokens"),
@ -216,16 +242,48 @@ class DeepSeekMVWeightAdaptor(MegatronVLLMWeightAdaptor):
if valid_names and name not in valid_names: if valid_names and name not in valid_names:
continue continue
if 'kv_a_proj_with_mqa' in name: if 'kv_a_proj_with_mqa' in name:
# Concatenate the kv_a_proj_with_mqa tensor with q_a_proj (or q_proj) and register the fused
# result under the qkv_proj name, replacing the original kv_a_proj_with_mqa entry.
q_param = dict(model.named_parameters()).get(name.replace('kv_a_proj_with_mqa', 'q_a_proj' if self.model_config.q_lora_rank else "q_proj"))
qkv_param_shape = torch.cat([q_param, param], dim=0).shape qkv_param_shape = torch.cat([q_param, param], dim=0).shape
qkv_name = name.replace('kv_a_proj_with_mqa', 'qkv_proj') qkv_name = name.replace('kv_a_proj_with_mqa', 'qkv_proj')
weight_buffer_meta[qkv_name] = {'shape': qkv_param_shape, 'dtype': param.dtype} weight_buffer_meta[qkv_name] = {'shape': qkv_param_shape, 'dtype': param.dtype}
elif 'q_a_proj' in name or 'q_proj' in name:
continue continue
else: else:
weight_buffer_meta[name] = {'shape': param.shape, 'dtype': param.dtype} weight_buffer_meta[name] = {'shape': param.shape, 'dtype': param.dtype}
return weight_buffer_meta return weight_buffer_meta
def convert_weight_name_meta(self, weight_names):
if not self.meta_info:
return weight_names
weight_names_meta = list()
for elements in weight_names:
if isinstance(elements, list):
tmp_weight_names_meta = self.convert_weight_name_meta(elements)
weight_names_meta.append(tmp_weight_names_meta)
else:
converted = False
if not converted and 'replace' in self.meta_info:
for key, value in self.meta_info['replace'].items():
if key in elements:
qkv_name = elements.replace(key, value)
weight_names_meta.append(qkv_name)
converted = True
break
if not converted and 'delete' in self.meta_info:
for key in self.meta_info['delete']:
if key in elements:
converted = True
break
if not converted:
weight_names_meta.append(elements)
return weight_names_meta
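# Illustrative only: for DeepSeek, meta_info = {'replace': {'kv_a_proj_with_mqa': 'qkv_proj'},
# 'delete': ['q_a_proj']}, so a (possibly nested) name list such as
#   ["model.layers.0.self_attn.kv_a_proj_with_mqa.weight",
#    "model.layers.0.self_attn.q_a_proj.weight",
#    "model.layers.0.mlp.gate.weight"]
# becomes
#   ["model.layers.0.self_attn.qkv_proj.weight",
#    "model.layers.0.mlp.gate.weight"]
# matching the fused qkv_proj entry created in get_weight_buffer_meta above.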
class QwenMVWeightAdaptor(MegatronVLLMWeightAdaptor): class QwenMVWeightAdaptor(MegatronVLLMWeightAdaptor):
""" """
@ -239,6 +297,7 @@ WEIGHT_ADAPTOR_REGISTRY = {
"Qwen2ForCausalLM": QwenMVWeightAdaptor, "Qwen2ForCausalLM": QwenMVWeightAdaptor,
"DeepseekV3ForCausalLM": DeepSeekMVWeightAdaptor, "DeepseekV3ForCausalLM": DeepSeekMVWeightAdaptor,
"DeepseekV2ForCausalLM": DeepSeekMVWeightAdaptor, "DeepseekV2ForCausalLM": DeepSeekMVWeightAdaptor,
"CustomDeepseekV3ForCausalLM": DeepSeekMVWeightAdaptor,
} }


@ -85,6 +85,7 @@ rl_config:
num_npus: 8 num_npus: 8
generate_config: generate_config:
enforce_eager: True
gen_micro_batch_size: 16 gen_micro_batch_size: 16
trust_remote_code: true trust_remote_code: true
offload_train_optimizer: true offload_train_optimizer: true


@ -1,24 +0,0 @@
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
# resolve the absolute path of this script
SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd)
export PYTHONPATH=$SCRIPT_DIR/../../..:$PYTHONPATH
GPUS_PER_NODE=1
MASTER_ADDR=localhost
MASTER_PORT=6555
NNODES=1
NODE_RANK=0
DISTRIBUTED_ARGS="
--nproc_per_node $GPUS_PER_NODE \
--nnodes $NNODES \
--node_rank $NODE_RANK \
--master_addr $MASTER_ADDR \
--master_port $MASTER_PORT
"
echo "start test_vllm_engine st"
torchrun $DISTRIBUTED_ARGS $SCRIPT_DIR/test_vllm_engine.py --distribute-backend nccl
torchrun $DISTRIBUTED_ARGS $SCRIPT_DIR/test_vllm_engine_multistep_decode.py --distribute-backend nccl


@ -1,140 +0,0 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (c) Huawei Technologies Co., Ltd.2023-2025. All rights reserved.
import os
import logging
import tensordict
import torch
from torch_npu.contrib import transfer_to_npu
from mindspeed_rl.models.rollout.vllm_engine import VLLMInferEngine
from mindspeed_rl.config_cls.megatron_config import MegatronConfig
from mindspeed_rl.utils.loggers import Loggers
tokenizer_name_or_path = "/data/for_dt/tokenizer/Llama-3.2-1B-Instruct/"
weights_path = "/data/for_dt/weights/Llama-3.2-1B-tp1pp1/iter_0000001/mp_rank_00/model_optim_rng.pt"
megatron_dict = {"num_attention_heads": 32,
"tensor_model_parallel_size": 1,
"num_query_groups": 8,
"group_query_attention": True}
sampling_config = {
"num_completions": 1, # 每个输入提示生成的独立完成项数量
"logprobs": 1, # 返回的 top token 的对数概率数量
"max_tokens": 128, # 生成输出的最大 token 数量
"best_of": 2, # 内部生成候选完成项的数量,从中选择最佳的一个
"top_p": 1.0, # 核采样的累积概率阈值
"top_k": 50, # 采样时考虑的最高概率 token 的数量
"min_p": 0.0, # token 选择的最小概率阈值
"temperature": 0.2, # 控制预测随机性的温度参数
"detokenize": False # 是否将生成的 token 转换回可读字符串
}
def main():
logger = Loggers(
name="test_vllm_engine",
)
logger.info("start test_vllm_engine")
conversation = [
{
"role": "system",
"content": "You are a helpful assistant"
},
{
"role": "user",
"content": "Hello"
},
{
"role": "assistant",
"content": "Hello! How can I assist you today?"
},
{
"role": "user",
"content": "Write an essay about the importance of higher education.",
},
]
logger.info("load megatron weight")
megatron_st = torch.load(weights_path)
actor_weights = megatron_st['model']
# parameters required for initialization
train_tensor_parallel_size = 1
train_pipeline_parallel_size = 1
infer_tensor_parallel_size = 1
infer_pipeline_parallel_size = 1
train_expert_parallel_size = 1
infer_expert_parallel_size = 1
max_num_seqs = 256
trust_remote_code = True
logger.info("enter vllmInferEngine")
megatron_config = MegatronConfig(megatron_dict, {})
megatron_config.num_attention_heads = 32
megatron_config.tensor_model_parallel_size = 1
megatron_config.num_query_groups = 8
megatron_config.num_key_value_heads = 8
megatron_config.group_query_attention = True
# 初始化 VLLMInferEngine 实例
inference_engine = VLLMInferEngine(
megatron_config=megatron_config,
sampling_config=sampling_config,
train_expert_parallel_size=train_expert_parallel_size,
infer_expert_parallel_size=infer_expert_parallel_size,
tokenizer_name_or_path=tokenizer_name_or_path,
train_tensor_parallel_size=train_tensor_parallel_size,
train_pipeline_parallel_size=train_pipeline_parallel_size,
infer_tensor_parallel_size=infer_tensor_parallel_size,
infer_pipeline_parallel_size=infer_pipeline_parallel_size,
max_num_seqs=max_num_seqs,
trust_remote_code=trust_remote_code
)
logger.info("model inited")
inference_engine.free_cache_engine()
torch.cuda.empty_cache()
logger.info("free_cache")
inference_engine.offload_model_weights()
logger.info("offload_model")
torch.cuda.empty_cache()
logger.info("empty_cache")
logger.info("enter sync_model_weights")
inference_engine.sync_model_weights(actor_weights)
logger.info("enter init_cache_engine")
inference_engine.init_cache_engine()
logger.info("=" * 80)
logger.info("start chat")
outputs = inference_engine.chat(conversation)
logger.info("chat result is ", outputs)
idx_list = []
idx_list_per_step = []
for i in range(2):
for j in range(4):
tokens = torch.randint(100, (10,))
idx_list_per_step.append(tokens.view(-1).cpu().numpy().tolist())
idx_list.extend(idx_list_per_step)
idx_list_per_step = []
logger.info(len(idx_list), [len(i) for i in idx_list])
logger.info("start test generate_sequences ")
outputs = inference_engine.generate_sequences(
idx_list=idx_list,
)
logger.info("generate_sequences output is:")
logger.info(outputs[0])
logger.info("input")
logger.info(idx_list[0])
if __name__ == "__main__":
main()


@ -1,141 +0,0 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (c) Huawei Technologies Co., Ltd.2023-2025. All rights reserved.
import os
import logging
import tensordict
import torch
from torch_npu.contrib import transfer_to_npu
from mindspeed_rl.models.rollout.vllm_engine import VLLMInferEngine
from mindspeed_rl.config_cls.megatron_config import MegatronConfig
from mindspeed_rl.utils.loggers import Loggers
tokenizer_name_or_path = "/data/for_dt/tokenizer/Llama-3.2-1B-Instruct/"
weights_path = "/data/for_dt/weights/Llama-3.2-1B-tp1pp1/iter_0000001/mp_rank_00/model_optim_rng.pt"
megatron_dict = {"num_attention_heads": 32,
"tensor_model_parallel_size": 1,
"num_query_groups": 8,
"group_query_attention": True}
sampling_config = {
"num_completions": 1, # 每个输入提示生成的独立完成项数量
"logprobs": 1, # 返回的 top token 的对数概率数量
"max_tokens": 128, # 生成输出的最大 token 数量
"best_of": 2, # 内部生成候选完成项的数量,从中选择最佳的一个
"top_p": 1.0, # 核采样的累积概率阈值
"top_k": 50, # 采样时考虑的最高概率 token 的数量
"min_p": 0.0, # token 选择的最小概率阈值
"temperature": 0.2, # 控制预测随机性的温度参数
"detokenize": False # 是否将生成的 token 转换回可读字符串
}
def main():
logger = Loggers(
name="test_vllm_engine_multistep_decode",
)
logger.info("start test_vllm_engine_multistep_decode")
conversation = [
{
"role": "system",
"content": "You are a helpful assistant"
},
{
"role": "user",
"content": "Hello"
},
{
"role": "assistant",
"content": "Hello! How can I assist you today?"
},
{
"role": "user",
"content": "Write an essay about the importance of higher education.",
},
]
logger.info("load megatron weight")
megatron_st = torch.load(weights_path)
actor_weights = megatron_st['model']
# parameters required for initialization
train_tensor_parallel_size = 1
train_pipeline_parallel_size = 1
infer_tensor_parallel_size = 1
infer_pipeline_parallel_size = 1
train_expert_parallel_size = 1
infer_expert_parallel_size = 1
max_num_seqs = 256
trust_remote_code = True
logger.info("enter vllmInferEngine")
megatron_config = MegatronConfig(megatron_dict, {})
megatron_config.num_attention_heads = 32
megatron_config.tensor_model_parallel_size = 1
megatron_config.num_query_groups = 8
megatron_config.num_key_value_heads = 8
megatron_config.group_query_attention = True
# 初始化 VLLMInferEngine 实例
inference_engine = VLLMInferEngine(
megatron_config=megatron_config,
sampling_config=sampling_config,
train_expert_parallel_size=train_expert_parallel_size,
infer_expert_parallel_size=infer_expert_parallel_size,
tokenizer_name_or_path=tokenizer_name_or_path,
train_tensor_parallel_size=train_tensor_parallel_size,
train_pipeline_parallel_size=train_pipeline_parallel_size,
infer_tensor_parallel_size=infer_tensor_parallel_size,
infer_pipeline_parallel_size=infer_pipeline_parallel_size,
max_num_seqs=max_num_seqs,
trust_remote_code=trust_remote_code,
num_scheduler_steps=8, # 8 decode steps
)
logger.info("model inited")
inference_engine.free_cache_engine()
torch.cuda.empty_cache()
logger.info("free_cache")
inference_engine.offload_model_weights()
logger.info("offload_model")
torch.cuda.empty_cache()
logger.info("empty_cache")
logger.info("enter sync_model_weights")
inference_engine.sync_model_weights(actor_weights)
logger.info("enter init_cache_engine")
inference_engine.init_cache_engine()
logger.info("=" * 80)
logger.info("start chat")
outputs = inference_engine.chat(conversation)
logger.info("chat result is ", outputs)
idx_list = []
idx_list_per_step = []
for i in range(2):
for j in range(4):
tokens = torch.randint(100, (10,))
idx_list_per_step.append(tokens.view(-1).cpu().numpy().tolist())
idx_list.extend(idx_list_per_step)
idx_list_per_step = []
logger.info(len(idx_list), [len(i) for i in idx_list])
logger.info("start test generate_sequences ")
outputs = inference_engine.generate_sequences(
idx_list=idx_list,
)
logger.info("generate_sequences output is:")
logger.info(outputs[0])
logger.info("input")
logger.info(idx_list[0])
if __name__ == "__main__":
main()


@ -1,45 +1,54 @@
# #!/bin/bash
# export CUDA_DEVICE_MAX_CONNECTIONS=1
# SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd)
# export PYTHONPATH=$SCRIPT_DIR/../../..:$PYTHONPATH
# export VLLM_DP_SIZE=1
# export HCCL_BUFFSIZE=256
# export VLLM_USE_V1=1
# export VLLM_VERSION=0.9.0
# export VLLM_ENABLE_GRAPH_MODE=0
# export VLLM_ENABLE_MC2=0
# export HCCL_OP_EXPANSION_MODE="AIV"
# export VLLM_ENABLE_TOPK_OPTIMZE=1
# GPUS_PER_NODE=8
# MASTER_ADDR=localhost
# MASTER_PORT=6555
# NNODES=1
# NODE_RANK=0
# WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
# DISTRIBUTED_ARGS="
#     --nproc_per_node $GPUS_PER_NODE \
#     --nnodes $NNODES \
#     --node_rank $NODE_RANK \
#     --master_addr $MASTER_ADDR \
#     --master_port $MASTER_PORT
# "
# PYTHON_ARGS="
#     --model-path "/data/for_dt/weights/Qwen2.5-7B-mg" \
#     --tokenizer-path "/data/for_dt/weights/Qwen2.5-7B" \
#     --train-tp 4 \
#     --train-pp 2 \
#     --train-ep 1 \
#     --infer-tp 2 \
#     --infer-pp 1 \
#     --infer-ep 1
# "
# PYTHON_ARGS_new="
#     --model-path "/data/for_dt/weights/Qwen2.5-7B-tp2pp2" \
#     --tokenizer-path "/data/for_dt/weights/Qwen2.5-7B" \
#     --train-tp 2 \
#     --train-pp 2 \
#     --train-ep 1 \
#     --infer-tp 4 \
#     --infer-pp 1 \
#     --infer-ep 1
# "
# echo "start test_resharding st"
# torchrun $DISTRIBUTED_ARGS $SCRIPT_DIR/test_resharding.py $PYTHON_ARGS
# torchrun $DISTRIBUTED_ARGS $SCRIPT_DIR/test_resharding.py $PYTHON_ARGS_new


@ -306,6 +306,7 @@ class TestActor():
train_tensor_parallel_size=args.train_tp, train_tensor_parallel_size=args.train_tp,
train_pipeline_parallel_size=args.train_pp, train_pipeline_parallel_size=args.train_pp,
train_expert_parallel_size=args.train_ep, train_expert_parallel_size=args.train_ep,
train_context_parallel_size=args.train_cp,
infer_tensor_parallel_size=args.infer_tp, infer_tensor_parallel_size=args.infer_tp,
infer_pipeline_parallel_size=args.infer_pp, infer_pipeline_parallel_size=args.infer_pp,
infer_expert_parallel_size=args.infer_ep, infer_expert_parallel_size=args.infer_ep,
@ -315,6 +316,7 @@ class TestActor():
dtype="bfloat16", dtype="bfloat16",
gpu_memory_utilization=0.6, gpu_memory_utilization=0.6,
trust_remote_code=True, trust_remote_code=True,
enforce_eager=True,
megatron_config=megatron_config megatron_config=megatron_config
) )
self.megatron_offloader = MegatronOffLoader(self.model, self.optimizer) self.megatron_offloader = MegatronOffLoader(self.model, self.optimizer)
@ -364,6 +366,7 @@ def parse_args():
parser.add_argument("--train-tp", type=int, default=2) parser.add_argument("--train-tp", type=int, default=2)
parser.add_argument("--train-pp", type=int, default=2) parser.add_argument("--train-pp", type=int, default=2)
parser.add_argument("--train-ep", type=int, default=1) parser.add_argument("--train-ep", type=int, default=1)
parser.add_argument("--train_cp", type=int, default=1)
parser.add_argument("--infer-tp", type=int, default=4) parser.add_argument("--infer-tp", type=int, default=4)
parser.add_argument("--infer-pp", type=int, default=1) parser.add_argument("--infer-pp", type=int, default=1)
parser.add_argument("--infer-ep", type=int, default=1) parser.add_argument("--infer-ep", type=int, default=1)