Mirror of https://github.com/vllm-project/vllm-ascend.git (synced 2025-10-20 13:43:53 +08:00)
[EPLB] Record expert map without dynamic eplb. (#3409)
What this PR does / why we need it?
1. Record the expert map without requiring dynamic EPLB.
2. Add `export PYTHONOPTIMIZE=1` when using dynamic EPLB.
3. Update the EPLB documentation.

Does this PR introduce any user-facing change?

How was this patch tested?
Tested with Qwen3_moe on A3.

- vLLM version: v0.11.0

---------

Signed-off-by: offline0806 <3337230449@qq.com>
Co-authored-by: offline0806 <3337230449@qq.com>
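Across the diff below, the first item shows up as one repeated pattern: the fused-MoE layers, the quantized MoE methods, and the NPU model runner now treat EPLB bookkeeping as enabled whenever `expert_map_record_path` is set, not only when `dynamic_eplb` is true. A minimal sketch of that behaviour, using a hypothetical stripped-down stand-in for the real ascend config object (only the two attribute names are taken from the diff):

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class AscendConfigStub:
    """Hypothetical stand-in for the real ascend config; only the two
    attribute names referenced in this diff are modelled."""
    dynamic_eplb: bool = False
    expert_map_record_path: Optional[str] = None


def eplb_bookkeeping_enabled(cfg: AscendConfigStub) -> bool:
    # Mirrors the repeated `ascend_config.dynamic_eplb or
    # ascend_config.expert_map_record_path` pattern: a non-empty record path
    # is enough to enable expert-load tracking, even if dynamic rebalancing
    # itself stays off.
    return bool(cfg.dynamic_eplb or cfg.expert_map_record_path)


# Recording only: the map is recorded, no dynamic rebalancing requested.
assert eplb_bookkeeping_enabled(AscendConfigStub(expert_map_record_path="/path/to/eplb.json"))
# Plain dynamic EPLB behaves as before.
assert eplb_bookkeeping_enabled(AscendConfigStub(dynamic_eplb=True))
# Neither option set: bookkeeping stays off.
assert not eplb_bookkeeping_enabled(AscendConfigStub())
```

In effect, pointing `expert_map_record_path` at a file is enough to turn on the expert-load tracking needed to record the map, while the weight redistribution of full dynamic EPLB stays opt-in.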
@@ -16,7 +16,7 @@ Expert balancing for MoE models in LLM serving is essential for optimal performance

### Dynamic EPLB

Enable dynamic balancing with auto-tuned parameters. Adjust num_iterations_eplb_update and num_wait_worker_iterations based on workload patterns.
We need to add environment variable `export PYTHONOPTIMIZE=1` to get context of vllm process. Enable dynamic balancing with auto-tuned parameters. Adjust num_iterations_eplb_update and num_wait_worker_iterations based on workload patterns.

```shell
vllm serve Qwen/Qwen3-235B-A22 \
@@ -25,7 +25,6 @@ vllm serve Qwen/Qwen3-235B-A22 \
--additional-config '{
"dynamic_eplb": true,
"num_iterations_eplb_update": 400,
"gate_eplb": true,
"num_wait_worker_iterations": 30
}'
```
@@ -42,9 +41,7 @@ vllm serve Qwen/Qwen3-235B-A22 \
--additional-config '{
"expert_map_record_path": "/path/to/eplb.json",
"init_redundancy_expert": 16,
"dynamic_eplb": true,
"num_iterations_eplb_update": 400,
"gate_eplb": true,
"num_wait_worker_iterations": 30
}'
```
@@ -137,7 +137,8 @@ class EplbUpdator:
self.compute_and_set_moe_load(is_clear=True)
self.wakeup_eplb_worker()

if self.update_expert_weight_flag():
if self.update_expert_weight_flag(
) and self.expert_map_record_path is None:
self.eplb_loader.update_expert_map_and_weight(self.reqs)

self.update_iteration()
@@ -26,7 +26,8 @@ from vllm.forward_context import get_forward_context
from vllm.logger import logger
from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig
from vllm.model_executor.layers.fused_moe.layer import (
FusedMoE, UnquantizedFusedMoEMethod, determine_expert_map)
FusedMoE, UnquantizedFusedMoEMethod, determine_expert_map,
get_compressed_expert_map)
from vllm.model_executor.layers.shared_fused_moe import SharedFusedMoE

from vllm_ascend.ascend_config import get_ascend_config
@@ -176,7 +177,7 @@ class AscendFusedMoE(FusedMoE):
self.moe_config.ep_group = get_ep_group()
self.moe_config.mc2_group = get_mc2_group()
ascend_config = get_ascend_config()
self.dynamic_eplb = ascend_config.dynamic_eplb
self.dynamic_eplb = ascend_config.dynamic_eplb or ascend_config.expert_map_record_path
self.expert_map_path = ascend_config.expert_map_path
self.global_redundant_expert_num = ascend_config.init_redundancy_expert
self.global_num_experts = num_experts + self.global_redundant_expert_num
@@ -203,6 +204,14 @@ class AscendFusedMoE(FusedMoE):
self.log2phy = determine_default_log2phy_map(
self.global_num_experts, self.ep_size, self.ep_rank,
self.global_redundant_expert_num).npu()
if self.expert_map is not None and isinstance(
self.expert_map, torch.Tensor):
logger.info_once(
"[EP Rank %s/%s] Expert parallelism is enabled. Local/global"
" number of experts: %s/%s. Experts local to global index map:"
" %s.", self.ep_rank, self.ep_size, self.local_num_experts,
self.global_num_experts,
get_compressed_expert_map(self.expert_map))
else:
# init moe.
self.local_num_experts, self.expert_map = determine_expert_map(
@@ -216,6 +225,14 @@ class AscendFusedMoE(FusedMoE):
self.log2phy = determine_default_log2phy_map(
self.global_num_experts, self.ep_size, self.ep_rank,
self.global_redundant_expert_num).npu()
if self.expert_map is not None and isinstance(
self.expert_map, torch.Tensor):
logger.info_once(
"[EP Rank %s/%s] Expert parallelism is enabled. Local/global"
" number of experts: %s/%s. Experts local to global index map:"
" %s.", self.ep_rank, self.ep_size, self.local_num_experts,
self.global_num_experts,
get_compressed_expert_map(self.expert_map))
local_num_experts = (torch.sum(
self.expert_map != -1) if self.expert_map is not None else
self.global_num_experts)
@@ -140,7 +140,8 @@ class AscendW4A8DynamicFusedMoEMethod:
# NOTE: new quantize weights: 2 int4 pack into int8
self.new_quant_version = quant_version == "1.0.0"
self.tp_size = 1 if vllm_config.parallel_config.enable_expert_parallel else self.ep_group.world_size
self.dynamic_eplb = get_ascend_config().dynamic_eplb
ascend_config = get_ascend_config()
self.dynamic_eplb = ascend_config.dynamic_eplb or ascend_config.expert_map_record_path
if self.new_quant_version and self.tp_size > 16:
raise ValueError(
"The current weight does not support moe part tp>16.")
@@ -124,7 +124,7 @@ class AscendW8A8DynamicFusedMoEMethod:
vllm_config.compilation_config.level == CompilationLevel.PIECEWISE
and not vllm_config.model_config.enforce_eager
and not ascend_config.torchair_graph_config.enabled)
self.dynamic_eplb = ascend_config.dynamic_eplb
self.dynamic_eplb = ascend_config.dynamic_eplb or ascend_config.expert_map_record_path

try:
device_group = get_mc2_group().device_group
@@ -35,7 +35,8 @@ from vllm.model_executor.layers.fused_moe.config import \
from vllm.model_executor.layers.fused_moe.config import \
FusedMoEParallelConfig # isort: skip
from vllm.model_executor.layers.fused_moe.layer import (
FusedMoE, UnquantizedFusedMoEMethod, determine_expert_map)
FusedMoE, UnquantizedFusedMoEMethod, determine_expert_map,
get_compressed_expert_map)
from vllm.model_executor.layers.quantization.base_config import \
QuantizationConfig
@@ -1028,7 +1029,7 @@ class TorchairAscendFusedMoE(FusedMoE):
self.moe_parallel_config.ep_size, is_deepseek_v3_r1)

ascend_config = get_ascend_config()
self.dynamic_eplb = ascend_config.dynamic_eplb
self.dynamic_eplb = ascend_config.dynamic_eplb or ascend_config.expert_map_record_path
self.expert_map_path = ascend_config.expert_map_path
self.global_redundant_expert_num = ascend_config.init_redundancy_expert
self.global_num_experts = num_experts + self.global_redundant_expert_num
@@ -1055,6 +1056,14 @@ class TorchairAscendFusedMoE(FusedMoE):
self.log2phy = determine_default_log2phy_map(
self.global_num_experts, self.ep_size, self.ep_rank,
self.global_redundant_expert_num).npu()
if self.expert_map is not None and isinstance(
self.expert_map, torch.Tensor):
logger.info_once(
"[EP Rank %s/%s] Expert parallelism is enabled. Local/global"
" number of experts: %s/%s. Experts local to global index map:"
" %s.", self.ep_rank, self.ep_size, self.local_num_experts,
self.global_num_experts,
get_compressed_expert_map(self.expert_map))
else:
# init moe.
self.local_num_experts, self.expert_map = determine_expert_map(
@@ -1068,6 +1077,14 @@ class TorchairAscendFusedMoE(FusedMoE):
self.log2phy = determine_default_log2phy_map(
self.global_num_experts, self.ep_size, self.ep_rank,
self.global_redundant_expert_num).npu()
if self.expert_map is not None and isinstance(
self.expert_map, torch.Tensor):
logger.info_once(
"[EP Rank %s/%s] Expert parallelism is enabled. Local/global"
" number of experts: %s/%s. Experts local to global index map:"
" %s.", self.ep_rank, self.ep_size, self.local_num_experts,
self.global_num_experts,
get_compressed_expert_map(self.expert_map))
local_num_experts = (torch.sum(self.expert_map != -1)
if self.expert_map is not None else num_experts)
if self.dynamic_eplb:
@@ -134,6 +134,7 @@ class TorchairAscendW4A8DynamicFusedMoEMethod:
self.ep_group = get_ep_group()

ascend_config = get_ascend_config()
self.dynamic_eplb = ascend_config.dynamic_eplb or ascend_config.expert_map_record_path
self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled

vllm_config = get_current_vllm_config()
@@ -336,7 +337,8 @@ class TorchairAscendW4A8DynamicFusedMoEMethod:
is_torchair=self.torchair_graph_enabled,
quantized_x_for_share=shared_gate_up,
dynamic_scale_for_share=shared_dequant_scale,
mc2_mask=kwargs.get("mc2_mask", None))
mc2_mask=kwargs.get("mc2_mask", None),
dynamic_eplb=self.dynamic_eplb)
else:
# The current implementation of deepseek moe splits hidden_states
# according to tp_size before they are feed into layers module.
@@ -848,7 +848,7 @@ class TorchairAscendW8A8DynamicFusedMoEMethod:
self.ep_group = get_ep_group()

ascend_config = get_ascend_config()
self.dynamic_eplb = ascend_config.dynamic_eplb
self.dynamic_eplb = ascend_config.dynamic_eplb or ascend_config.expert_map_record_path
self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled
self.enable_shared_expert_dp = ascend_config.enable_shared_expert_dp
@@ -475,8 +475,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
dtype=torch.bool,
device=self.device,
)

self.dynamic_eplb = self.ascend_config.dynamic_eplb
self.dynamic_eplb = self.ascend_config.dynamic_eplb or self.ascend_config.expert_map_record_path
if self.dynamic_eplb:
self.is_eplb_warmuped = False
self.policy_type = self.ascend_config.eplb_policy_type
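On the updater side, the `EplbUpdator` hunk above keeps computing the per-expert load and waking the EPLB worker, but the new guard skips the actual expert map/weight update whenever a record path is configured, so the map is only recorded. A rough sketch of that control flow; apart from the guard condition, every method here is a hypothetical stub:

```python
from typing import Optional


class EplbUpdatorSketch:
    """Simplified control flow around the changed guard in EplbUpdator.

    Only the guard condition is taken from the diff; every method body here
    is a hypothetical stub standing in for the real implementation.
    """

    def __init__(self, expert_map_record_path: Optional[str] = None):
        self.expert_map_record_path = expert_map_record_path

    def update_expert_weight_flag(self) -> bool:
        # Stub: the real check depends on the update/wait iteration counters.
        return True

    def compute_and_set_moe_load(self, is_clear: bool = False) -> None:
        print("collect per-expert load statistics (is_clear=%s)" % is_clear)

    def wakeup_eplb_worker(self) -> None:
        print("wake the EPLB worker (plans / records the expert map)")

    def update_expert_map_and_weight(self) -> None:
        print("redistribute expert weights across ranks")

    def step(self) -> None:
        self.compute_and_set_moe_load(is_clear=True)
        self.wakeup_eplb_worker()
        # The new guard: when a record path is configured, the expensive
        # weight redistribution is skipped and the map is only recorded.
        if self.update_expert_weight_flag() and self.expert_map_record_path is None:
            self.update_expert_map_and_weight()


EplbUpdatorSketch(expert_map_record_path="/path/to/eplb.json").step()  # recording only
EplbUpdatorSketch().step()  # full dynamic EPLB path
```

Running the sketch, the first instance stops after waking the worker while the second goes on to redistribute weights, which mirrors the recording-only versus full dynamic EPLB split described in the commit message.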