i honestly can't believe i spelled it that way

Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
merge main, add environment variable, factor into function
2025-10-20 23:03:52 +08:00 · 2025-07-04 15:14:03 -04:00 · 2025-07-04 15:11:40 -04:00 · 2025-06-25 16:46:41 -04:00
2 changed files with 52 additions and 0 deletions
--- a/vllm/envs.py
+++ b/vllm/envs.py
@ -128,6 +128,7 @@ if TYPE_CHECKING:
    VLLM_NIXL_SIDE_CHANNEL_PORT: int = 5557
    VLLM_ALL2ALL_BACKEND: str = "naive"
    VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: int = 163840
+    VLLM_UNIFORM_RANDOM_TOPK_IDS: bool = False 
    VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS: int = 1
    VLLM_SLEEP_WHEN_IDLE: bool = False
    VLLM_MQ_MAX_CHUNK_BYTES_MB: int = 16
@ -913,6 +914,13 @@ environment_variables: dict[str, Callable[[], Any]] = {
    "VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE":
    lambda: int(os.getenv("VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE", "163840")),

+    # Use uniform random topk ids for perfect load balancing in expectation.
+    # Use it for analyzing performance when using --load-format=dummy.
+    # MoE layers will not produce the correct answer when it is set.
+    "VLLM_UNIFORM_RANDOM_TOPK_IDS":
+    lambda: os.environ.get("VLLM_UNIFORM_RANDOM_TOPK_IDS", "false").lower() in
+    ("1", "true"),
+
    # Regex timeout for use by the vLLM tool parsing plugins.
    "VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS":
    lambda: int(os.getenv("VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS", "1")),
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@ -1154,6 +1154,41 @@ class FusedMoE(torch.nn.Module):
        self.logical_to_physical_map = logical_to_physical_map[moe_layer_idx]
        self.logical_replica_count = logical_replica_count[moe_layer_idx]

+    @staticmethod
+    def uniform_random_select_experts(
+        hidden_states: torch.Tensor,
+        router_logits: torch.Tensor,
+        top_k: int,
+        indices_type: Optional[torch.dtype] = None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        # Number of tokens in the current batch
+        num_tokens = hidden_states.shape[0]
+
+        # Infer how many experts exist from the router-logit dimension
+        global_num_experts = router_logits.shape[-1]
+
+        # Choose a dtype for the indices
+        if indices_type is None:
+            indices_type = torch.long
+
+        # Random expert IDs, uniform in [0, global_num_experts)
+        topk_ids = torch.randint(
+            low=0,
+            high=global_num_experts,
+            size=(num_tokens, top_k),
+            dtype=indices_type,
+            device=hidden_states.device,
+        )
+
+        # All-ones weights
+        topk_weights = torch.ones(
+            (num_tokens, top_k),
+            dtype=torch.float32,
+            device=hidden_states.device,
+        )
+
+        return topk_weights, topk_ids
+
    @staticmethod
    def select_experts(
        hidden_states: torch.Tensor,
@ -1187,6 +1222,15 @@ class FusedMoE(torch.nn.Module):
        """
        from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk

+        # Uniform random topk ids for performance experiments,
+        # especially when using dummy weights.
+        if envs.VLLM_UNIFORM_RANDOM_TOPK_IDS:
+            return FusedMoE.uniform_random_select_experts(
+                hidden_states,
+                router_logits,
+                top_k,
+                indices_type=indices_type)
+
        # DeepSeekv2 uses grouped_top_k
        if use_grouped_topk:
            assert topk_group is not None
Author	SHA1	Message	Date
Tyler Michael Smith	8209f9057d	i honestly can't believe i spelled it that way Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>	2025-07-04 15:14:03 -04:00
Tyler Michael Smith	19c51c3439	merge main, add environment variable, factor into function Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>	2025-07-04 15:11:40 -04:00
Tyler Michael Smith	14a6efb83e	hack for topk ids Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>	2025-06-25 16:46:41 -04:00