diff --git a/docs/source/user_guide/feature_guide/eplb_swift_balancer.md b/docs/source/user_guide/feature_guide/eplb_swift_balancer.md
index 707b2632d..010aa6a92 100644
--- a/docs/source/user_guide/feature_guide/eplb_swift_balancer.md
+++ b/docs/source/user_guide/feature_guide/eplb_swift_balancer.md
@@ -16,7 +16,7 @@ Expert balancing for MoE models in LLM serving is essential for optimal performa
 
 ### Dynamic EPLB
 
-Enable dynamic balancing with auto-tuned parameters. Adjust num_iterations_eplb_update and num_wait_worker_iterations based on workload patterns.
+Set the environment variable `export PYTHONOPTIMIZE=1` so that the EPLB worker can obtain the context of the vLLM process. Enable dynamic balancing with auto-tuned parameters. Adjust num_iterations_eplb_update and num_wait_worker_iterations based on workload patterns.
 
 ```shell
 vllm serve Qwen/Qwen3-235B-A22 \
@@ -25,7 +25,6 @@ vllm serve Qwen/Qwen3-235B-A22 \
     --additional-config '{
         "dynamic_eplb": true,
         "num_iterations_eplb_update": 400,
-        "gate_eplb": true,
         "num_wait_worker_iterations": 30
     }'
 ```
@@ -42,9 +41,7 @@ vllm serve Qwen/Qwen3-235B-A22 \
     --additional-config '{
         "expert_map_record_path": "/path/to/eplb.json",
         "init_redundancy_expert": 16,
-        "dynamic_eplb": true,
         "num_iterations_eplb_update": 400,
-        "gate_eplb": true,
         "num_wait_worker_iterations": 30
     }'
 ```
diff --git a/vllm_ascend/eplb/eplb_updator.py b/vllm_ascend/eplb/eplb_updator.py
index 1f25f8fb8..ae19c886a 100644
--- a/vllm_ascend/eplb/eplb_updator.py
+++ b/vllm_ascend/eplb/eplb_updator.py
@@ -137,7 +137,8 @@ class EplbUpdator:
             self.compute_and_set_moe_load(is_clear=True)
             self.wakeup_eplb_worker()
 
-        if self.update_expert_weight_flag():
+        if self.update_expert_weight_flag(
+        ) and self.expert_map_record_path is None:
             self.eplb_loader.update_expert_map_and_weight(self.reqs)
 
         self.update_iteration()
diff --git a/vllm_ascend/ops/common_fused_moe.py b/vllm_ascend/ops/common_fused_moe.py
index dd2b16616..c1a32c660 100644
--- a/vllm_ascend/ops/common_fused_moe.py
+++ b/vllm_ascend/ops/common_fused_moe.py
@@ -26,7 +26,8 @@ from vllm.forward_context import get_forward_context
 from vllm.logger import logger
 from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig
 from vllm.model_executor.layers.fused_moe.layer import (
-    FusedMoE, UnquantizedFusedMoEMethod, determine_expert_map)
+    FusedMoE, UnquantizedFusedMoEMethod, determine_expert_map,
+    get_compressed_expert_map)
 from vllm.model_executor.layers.shared_fused_moe import SharedFusedMoE
 
 from vllm_ascend.ascend_config import get_ascend_config
@@ -176,7 +177,7 @@ class AscendFusedMoE(FusedMoE):
         self.moe_config.ep_group = get_ep_group()
         self.moe_config.mc2_group = get_mc2_group()
         ascend_config = get_ascend_config()
-        self.dynamic_eplb = ascend_config.dynamic_eplb
+        self.dynamic_eplb = ascend_config.dynamic_eplb or ascend_config.expert_map_record_path
         self.expert_map_path = ascend_config.expert_map_path
         self.global_redundant_expert_num = ascend_config.init_redundancy_expert
         self.global_num_experts = num_experts + self.global_redundant_expert_num
@@ -203,6 +204,14 @@ class AscendFusedMoE(FusedMoE):
             self.log2phy = determine_default_log2phy_map(
                 self.global_num_experts, self.ep_size, self.ep_rank,
                 self.global_redundant_expert_num).npu()
+            if self.expert_map is not None and isinstance(
+                    self.expert_map, torch.Tensor):
+                logger.info_once(
+                    "[EP Rank %s/%s] Expert parallelism is enabled. Local/global"
+                    " number of experts: %s/%s. Experts local to global index map:"
+                    " %s.", self.ep_rank, self.ep_size, self.local_num_experts,
+                    self.global_num_experts,
+                    get_compressed_expert_map(self.expert_map))
         else:
             # init moe.
             self.local_num_experts, self.expert_map = determine_expert_map(
@@ -216,6 +225,14 @@ class AscendFusedMoE(FusedMoE):
             self.log2phy = determine_default_log2phy_map(
                 self.global_num_experts, self.ep_size, self.ep_rank,
                 self.global_redundant_expert_num).npu()
+            if self.expert_map is not None and isinstance(
+                    self.expert_map, torch.Tensor):
+                logger.info_once(
+                    "[EP Rank %s/%s] Expert parallelism is enabled. Local/global"
+                    " number of experts: %s/%s. Experts local to global index map:"
+                    " %s.", self.ep_rank, self.ep_size, self.local_num_experts,
+                    self.global_num_experts,
+                    get_compressed_expert_map(self.expert_map))
         local_num_experts = (torch.sum(
             self.expert_map != -1) if self.expert_map is not None else
                              self.global_num_experts)
diff --git a/vllm_ascend/quantization/w4a8_dynamic.py b/vllm_ascend/quantization/w4a8_dynamic.py
index 4f4dbb048..c8c1eeb69 100644
--- a/vllm_ascend/quantization/w4a8_dynamic.py
+++ b/vllm_ascend/quantization/w4a8_dynamic.py
@@ -140,7 +140,8 @@ class AscendW4A8DynamicFusedMoEMethod:
         # NOTE: new quantize weights: 2 int4 pack into int8
         self.new_quant_version = quant_version == "1.0.0"
         self.tp_size = 1 if vllm_config.parallel_config.enable_expert_parallel else self.ep_group.world_size
-        self.dynamic_eplb = get_ascend_config().dynamic_eplb
+        ascend_config = get_ascend_config()
+        self.dynamic_eplb = ascend_config.dynamic_eplb or ascend_config.expert_map_record_path
         if self.new_quant_version and self.tp_size > 16:
             raise ValueError(
                 "The current weight does not support moe part tp>16.")
diff --git a/vllm_ascend/quantization/w8a8_dynamic.py b/vllm_ascend/quantization/w8a8_dynamic.py
index 978826e5c..598e0e50a 100644
--- a/vllm_ascend/quantization/w8a8_dynamic.py
+++ b/vllm_ascend/quantization/w8a8_dynamic.py
@@ -124,7 +124,7 @@ class AscendW8A8DynamicFusedMoEMethod:
             vllm_config.compilation_config.level == CompilationLevel.PIECEWISE
             and not vllm_config.model_config.enforce_eager
             and not ascend_config.torchair_graph_config.enabled)
-        self.dynamic_eplb = ascend_config.dynamic_eplb
+        self.dynamic_eplb = ascend_config.dynamic_eplb or ascend_config.expert_map_record_path
 
         try:
             device_group = get_mc2_group().device_group
diff --git a/vllm_ascend/torchair/ops/torchair_fused_moe.py b/vllm_ascend/torchair/ops/torchair_fused_moe.py
index e7d009306..a54700836 100644
--- a/vllm_ascend/torchair/ops/torchair_fused_moe.py
+++ b/vllm_ascend/torchair/ops/torchair_fused_moe.py
@@ -35,7 +35,8 @@ from vllm.model_executor.layers.fused_moe.config import \
 from vllm.model_executor.layers.fused_moe.config import \
     FusedMoEParallelConfig  # isort: skip
 from vllm.model_executor.layers.fused_moe.layer import (
-    FusedMoE, UnquantizedFusedMoEMethod, determine_expert_map)
+    FusedMoE, UnquantizedFusedMoEMethod, determine_expert_map,
+    get_compressed_expert_map)
 from vllm.model_executor.layers.quantization.base_config import \
     QuantizationConfig
 
@@ -1028,7 +1029,7 @@ class TorchairAscendFusedMoE(FusedMoE):
             self.moe_parallel_config.ep_size, is_deepseek_v3_r1)
 
         ascend_config = get_ascend_config()
-        self.dynamic_eplb = ascend_config.dynamic_eplb
+        self.dynamic_eplb = ascend_config.dynamic_eplb or ascend_config.expert_map_record_path
         self.expert_map_path = ascend_config.expert_map_path
         self.global_redundant_expert_num = ascend_config.init_redundancy_expert
         self.global_num_experts = num_experts + self.global_redundant_expert_num
@@ -1055,6 +1056,14 @@ class TorchairAscendFusedMoE(FusedMoE):
             self.log2phy = determine_default_log2phy_map(
                 self.global_num_experts, self.ep_size, self.ep_rank,
                 self.global_redundant_expert_num).npu()
+            if self.expert_map is not None and isinstance(
+                    self.expert_map, torch.Tensor):
+                logger.info_once(
+                    "[EP Rank %s/%s] Expert parallelism is enabled. Local/global"
+                    " number of experts: %s/%s. Experts local to global index map:"
+                    " %s.", self.ep_rank, self.ep_size, self.local_num_experts,
+                    self.global_num_experts,
+                    get_compressed_expert_map(self.expert_map))
         else:
             # init moe.
             self.local_num_experts, self.expert_map = determine_expert_map(
@@ -1068,6 +1077,14 @@ class TorchairAscendFusedMoE(FusedMoE):
             self.log2phy = determine_default_log2phy_map(
                 self.global_num_experts, self.ep_size, self.ep_rank,
                 self.global_redundant_expert_num).npu()
+            if self.expert_map is not None and isinstance(
+                    self.expert_map, torch.Tensor):
+                logger.info_once(
+                    "[EP Rank %s/%s] Expert parallelism is enabled. Local/global"
+                    " number of experts: %s/%s. Experts local to global index map:"
+                    " %s.", self.ep_rank, self.ep_size, self.local_num_experts,
+                    self.global_num_experts,
+                    get_compressed_expert_map(self.expert_map))
         local_num_experts = (torch.sum(self.expert_map != -1)
                              if self.expert_map is not None else num_experts)
         if self.dynamic_eplb:
diff --git a/vllm_ascend/torchair/quantization/torchair_w4a8_dynamic.py b/vllm_ascend/torchair/quantization/torchair_w4a8_dynamic.py
index 02deee899..ff7b0eeda 100644
--- a/vllm_ascend/torchair/quantization/torchair_w4a8_dynamic.py
+++ b/vllm_ascend/torchair/quantization/torchair_w4a8_dynamic.py
@@ -134,6 +134,7 @@ class TorchairAscendW4A8DynamicFusedMoEMethod:
         self.ep_group = get_ep_group()
 
         ascend_config = get_ascend_config()
+        self.dynamic_eplb = ascend_config.dynamic_eplb or ascend_config.expert_map_record_path
         self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled
 
         vllm_config = get_current_vllm_config()
@@ -336,7 +337,8 @@ class TorchairAscendW4A8DynamicFusedMoEMethod:
                 is_torchair=self.torchair_graph_enabled,
                 quantized_x_for_share=shared_gate_up,
                 dynamic_scale_for_share=shared_dequant_scale,
-                mc2_mask=kwargs.get("mc2_mask", None))
+                mc2_mask=kwargs.get("mc2_mask", None),
+                dynamic_eplb=self.dynamic_eplb)
         else:
             # The current implementation of deepseek moe splits hidden_states
             # according to tp_size before they are feed into layers module.
diff --git a/vllm_ascend/torchair/quantization/torchair_w8a8_dynamic.py b/vllm_ascend/torchair/quantization/torchair_w8a8_dynamic.py
index b933db6b6..573933a16 100644
--- a/vllm_ascend/torchair/quantization/torchair_w8a8_dynamic.py
+++ b/vllm_ascend/torchair/quantization/torchair_w8a8_dynamic.py
@@ -848,7 +848,7 @@ class TorchairAscendW8A8DynamicFusedMoEMethod:
         self.ep_group = get_ep_group()
 
         ascend_config = get_ascend_config()
-        self.dynamic_eplb = ascend_config.dynamic_eplb
+        self.dynamic_eplb = ascend_config.dynamic_eplb or ascend_config.expert_map_record_path
         self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled
         self.enable_shared_expert_dp = ascend_config.enable_shared_expert_dp
 
diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index 3253492b2..e30fc6684 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -475,8 +475,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
             dtype=torch.bool,
             device=self.device,
         )
-
-        self.dynamic_eplb = self.ascend_config.dynamic_eplb
+        self.dynamic_eplb = self.ascend_config.dynamic_eplb or self.ascend_config.expert_map_record_path
        if self.dynamic_eplb:
            self.is_eplb_warmuped = False
            self.policy_type = self.ascend_config.eplb_policy_type
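
The recurring change in this patch replaces `self.dynamic_eplb = ascend_config.dynamic_eplb` with `ascend_config.dynamic_eplb or ascend_config.expert_map_record_path`, so configuring `expert_map_record_path` alone is enough to turn on expert-load tracking. The sketch below is illustrative only and not part of the patch; `AscendConfigStub` and `eplb_load_tracking_enabled` are hypothetical names standing in for the real `AscendConfig` fields used above.

```python
# Illustrative sketch (not part of the patch): mirrors the repeated
# `dynamic_eplb or expert_map_record_path` pattern introduced in this diff.
# AscendConfigStub is a hypothetical stand-in exposing only the two fields used here.
from dataclasses import dataclass
from typing import Optional


@dataclass
class AscendConfigStub:
    dynamic_eplb: bool = False
    expert_map_record_path: Optional[str] = None


def eplb_load_tracking_enabled(cfg: AscendConfigStub) -> bool:
    # A non-empty record path enables load tracking even when dynamic_eplb
    # stays False, matching the new `or` expression in the diff.
    return bool(cfg.dynamic_eplb or cfg.expert_map_record_path)


assert eplb_load_tracking_enabled(AscendConfigStub(dynamic_eplb=True))
assert eplb_load_tracking_enabled(
    AscendConfigStub(expert_map_record_path="/path/to/eplb.json"))
assert not eplb_load_tracking_enabled(AscendConfigStub())
```

Note that in the patch itself `self.dynamic_eplb` may end up holding the path string rather than a strict boolean; the usages visible in this diff treat it as a truthy flag (`if self.dynamic_eplb:`), while `dynamic_eplb=self.dynamic_eplb` simply forwards the value.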