Compare commits

...

3 Commits

SHA1       Message                                        Date
e8c78d992c Update fused_marlin_moe_fake                   2025-09-18 11:43:58 -07:00
           Signed-off-by: mgoin <mgoin64@gmail.com>
d643d6a418 Merge branch 'main' into marlin_gptoss_swiglu  2025-09-18 10:59:27 -07:00
           Signed-off-by: mgoin <mgoin64@gmail.com>
d256cd23c1 activation plumbing for fused_marlin_moe       2025-09-02 09:24:24 -04:00
           Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
6 changed files with 19 additions and 15 deletions

View File

@@ -161,6 +161,9 @@ def fused_marlin_moe(hidden_states: torch.Tensor,
     if activation == "silu":
         torch.ops._C.silu_and_mul(intermediate_cache2,
                                   intermediate_cache1.view(-1, 2 * N))
+    elif activation == "gelu":
+        torch.ops._C.gelu_and_mul(intermediate_cache2,
+                                  intermediate_cache1.view(-1, 2 * N))
     elif activation == "swigluoai":
         # alpha = 1.702, limit = 7.0
         torch.ops._C.swigluoai_and_mul(intermediate_cache2,
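[Editor's note] For reference, a minimal eager-mode sketch of what the three fused _and_mul kernels above compute, assuming the gate/up half-split layout that silu_and_mul uses; the clamped-SwiGLU formula follows the gpt-oss variant named by the branch, and the exact semantics of the _C kernels (e.g. interleaved vs. split halves for swigluoai) may differ:

import torch
import torch.nn.functional as F

def activation_and_mul_ref(x: torch.Tensor, activation: str = "silu",
                           alpha: float = 1.702, limit: float = 7.0) -> torch.Tensor:
    # x: (..., 2 * N); gate projection in the first half, up projection in the second.
    gate, up = x.chunk(2, dim=-1)
    if activation == "silu":
        return F.silu(gate) * up
    if activation == "gelu":
        return F.gelu(gate) * up
    if activation == "swigluoai":
        # Clamped SwiGLU as used by gpt-oss: alpha = 1.702, limit = 7.0.
        gate = gate.clamp(max=limit)
        up = up.clamp(min=-limit, max=limit)
        return (gate * torch.sigmoid(alpha * gate)) * (up + 1)
    raise ValueError(f"Unsupported activation: {activation}")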
@@ -209,6 +212,8 @@ def fused_marlin_moe(hidden_states: torch.Tensor,
 def fused_marlin_moe_fake(hidden_states: torch.Tensor,
                           w1: torch.Tensor,
                           w2: torch.Tensor,
+                          bias1: Optional[torch.Tensor],
+                          bias2: Optional[torch.Tensor],
                           w1_scale: torch.Tensor,
                           w2_scale: torch.Tensor,
                           gating_output: torch.Tensor,
@@ -217,9 +222,10 @@ def fused_marlin_moe_fake(hidden_states: torch.Tensor,
                           quant_type_id: int,
                           apply_router_weight_on_input: bool = False,
                           global_num_experts: int = -1,
+                          activation: Optional[str] = "silu",
+                          expert_map: Optional[torch.Tensor] = None,
                           global_scale1: Optional[torch.Tensor] = None,
                           global_scale2: Optional[torch.Tensor] = None,
-                          expert_map: Optional[torch.Tensor] = None,
                           g_idx1: Optional[torch.Tensor] = None,
                           g_idx2: Optional[torch.Tensor] = None,
                           sort_indices1: Optional[torch.Tensor] = None,
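[Editor's note] The signature churn above matters because the fake (meta) implementation is what torch.compile uses to trace shapes and dtypes without running the real kernel; if its parameter list drifts from the real op's schema, compilation breaks. A minimal sketch of the mechanism with a hypothetical demo op, not vLLM's actual registration code:

import torch

@torch.library.custom_op("demo::gated_mlp", mutates_args=())
def gated_mlp(x: torch.Tensor, activation: str = "silu") -> torch.Tensor:
    gate, up = x.chunk(2, dim=-1)
    return torch.nn.functional.silu(gate) * up

@gated_mlp.register_fake
def _(x: torch.Tensor, activation: str = "silu") -> torch.Tensor:
    # Must mirror the real signature; computes only output shape/dtype.
    return x.new_empty(*x.shape[:-1], x.shape[-1] // 2)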

View File

@@ -518,8 +518,6 @@ class AWQMoEMethod(FusedMoEMethodBase):
             raise NotImplementedError(
                 "EPLB not supported for `AWQMoEMethod` yet.")
 
-        assert activation == "silu", "Only SiLU activation is supported."
-
         topk_weights, topk_ids = FusedMoE.select_experts(
             hidden_states=x,
             router_logits=router_logits,
@@ -548,6 +546,7 @@ class AWQMoEMethod(FusedMoEMethodBase):
             quant_type_id=self.quant_type.id,
             apply_router_weight_on_input=apply_router_weight_on_input,
             global_num_experts=global_num_experts,
+            activation=activation,
             expert_map=expert_map,
             w1_zeros=layer.w13_qzeros,
             w2_zeros=layer.w2_qzeros,
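[Editor's note] This file shows the change in its simplest form, and the same two-step edit repeats in every quantization method below: delete the method-level SiLU assert and forward the caller's activation into the Marlin kernel. A runnable toy of the before/after shape, using stand-in names rather than vLLM internals:

import torch
import torch.nn.functional as F

def marlin_moe_toy(x: torch.Tensor, activation: str = "silu") -> torch.Tensor:
    # Stand-in for torch.ops.vllm.fused_marlin_moe's activation dispatch.
    act = {"silu": F.silu, "gelu": F.gelu}[activation]
    gate, up = x.chunk(2, dim=-1)
    return act(gate) * up

class MoEMethodToy:
    def apply(self, x: torch.Tensor, activation: str = "silu") -> torch.Tensor:
        # Before: assert activation == "silu", "Only SiLU ... supported."
        # After: plumb the argument through unchanged.
        return marlin_moe_toy(x, activation=activation)

print(MoEMethodToy().apply(torch.randn(2, 8), activation="gelu").shape)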

View File

@@ -373,7 +373,6 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
         if enable_eplb:
             raise NotImplementedError("EPLB not supported for "
                                       "`CompressedTensorsW4A4MoeMethod` yet.")
-        assert activation == "silu", "Only SiLU activation is supported."
 
         topk_weights, topk_ids = FusedMoE.select_experts(
             hidden_states=x,
@@ -412,10 +411,14 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
                 quant_type_id=scalar_types.float4_e2m1f.id,
                 apply_router_weight_on_input=apply_router_weight_on_input,
                 global_num_experts=global_num_experts,
+                activation=activation,
                 expert_map=expert_map,
                 workspace=layer.workspace)
 
-        elif self.fused_experts is not None:
+        assert activation == "silu", "Only SiLU activation is supported."
+
+        # FlashInfer fused experts path
+        if self.fused_experts is not None:
             assert is_valid_flashinfer_cutlass_fused_moe(
                 x, layer.w13_weight, layer.w2_weight), (
                     "Flashinfer CUTLASS Fused MoE not applicable!")
@@ -871,8 +874,6 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
         # cutlass fp8 or fused_experts but not marlin or rocm.
         #
         if self.use_marlin:
-            assert activation == "silu", (
-                f"{activation} not supported for Marlin MoE.")
             assert self.fused_experts is None
             return torch.ops.vllm.fused_marlin_moe(
                 x,
@@ -888,6 +889,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
                 quant_type_id=scalar_types.float8_e4m3fn.id,
                 apply_router_weight_on_input=apply_router_weight_on_input,
                 global_num_experts=global_num_experts,
+                activation=activation,
                 expert_map=expert_map,
                 workspace=layer.workspace)
@@ -1411,9 +1413,6 @@ class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod):
             raise NotImplementedError(
                 "EPLB not supported for "
                 "`CompressedTensorsWNA16MarlinMoEMethod` yet.")
-        assert activation == "silu", (
-            f"{activation} not supported for Marlin MoE.")
-
         topk_weights, topk_ids = FusedMoE.select_experts(
             hidden_states=x,
             router_logits=router_logits,
@@ -1442,6 +1441,7 @@ class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod):
             quant_type_id=self.quant_type.id,
             apply_router_weight_on_input=apply_router_weight_on_input,
             global_num_experts=global_num_experts,
+            activation=activation,
             expert_map=expert_map,
             g_idx1=layer.w13_weight_g_idx,
             g_idx2=layer.w2_weight_g_idx,

View File

@@ -1003,8 +1003,6 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                 expert_map=expert_map,
                 quant_config=self.moe_quant_config)
         elif self.use_marlin:
-            assert activation == "silu", (
-                f"{activation} not supported for Marlin MoE.")
             assert self.fused_experts is None
             return torch.ops.vllm.fused_marlin_moe(
                 x,
@@ -1020,6 +1018,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                 quant_type_id=scalar_types.float8_e4m3fn.id,
                 apply_router_weight_on_input=apply_router_weight_on_input,
                 global_num_experts=global_num_experts,
+                activation=activation,
                 expert_map=expert_map,
                 workspace=layer.workspace)
         elif self.fused_experts:

View File

@@ -667,8 +667,6 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
             raise NotImplementedError(
                 "EPLB not supported for `GPTQMarlinMoEMethod` yet.")
 
-        assert activation == "silu", "Only SiLU activation is supported."
-
         topk_weights, topk_ids = FusedMoE.select_experts(
             hidden_states=x,
             router_logits=router_logits,
@@ -697,6 +695,7 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
             quant_type_id=self.quant_type.id,
             apply_router_weight_on_input=apply_router_weight_on_input,
             global_num_experts=global_num_experts,
+            activation=activation,
             expert_map=expert_map,
             g_idx1=layer.w13_g_idx,
             g_idx2=layer.w2_g_idx,

View File

@@ -1411,13 +1411,13 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
         if enable_eplb:
             raise NotImplementedError(
                 "EPLB not supported for `ModelOptNvFp4FusedMoE` yet.")
-        assert activation == "silu", "Only SiLU activation is supported."
 
         if (self.allow_flashinfer and self.flashinfer_moe_backend
                 == FlashinferMoeBackend.TENSORRT_LLM):
             import flashinfer
             from vllm.model_executor.models.llama4 import Llama4MoE
 
+            assert activation == "silu", "Only SiLU activation is supported."
             assert self.fused_experts is None
@@ -1507,6 +1507,7 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
                 quant_type_id=scalar_types.float4_e2m1f.id,
                 apply_router_weight_on_input=apply_router_weight_on_input,
                 global_num_experts=global_num_experts,
+                activation=activation,
                 expert_map=expert_map,
                 workspace=layer.workspace)