Mirror of https://github.com/vllm-project/vllm.git, synced 2025-10-20 23:03:52 +08:00

Compare commits: v0.11.0rc2...marlin_gpt (3 commits)
Author | SHA1 | Date
---|---|---
 | e8c78d992c |
 | d643d6a418 |
 | d256cd23c1 |

(Author and date columns were not captured in this mirror view.)
@@ -161,6 +161,9 @@ def fused_marlin_moe(hidden_states: torch.Tensor,
    if activation == "silu":
        torch.ops._C.silu_and_mul(intermediate_cache2,
                                  intermediate_cache1.view(-1, 2 * N))
    elif activation == "gelu":
        torch.ops._C.gelu_and_mul(intermediate_cache2,
                                  intermediate_cache1.view(-1, 2 * N))
    elif activation == "swigluoai":
        # alpha = 1.702, limit = 7.0
        torch.ops._C.swigluoai_and_mul(intermediate_cache2,
                                       intermediate_cache1.view(-1, 2 * N))
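All three branches above consume the concatenated [gate, up] projection of width 2*N and write an output of width N. Below is a minimal PyTorch sketch of the semantics (not the CUDA kernels), assuming the gpt-oss-style clamped SwiGLU for `swigluoai` with the alpha = 1.702 and limit = 7.0 defaults noted in the comment; the actual kernel's memory layout may differ:

```python
import torch
import torch.nn.functional as F

def silu_and_mul(x: torch.Tensor) -> torch.Tensor:
    gate, up = x.chunk(2, dim=-1)
    return F.silu(gate) * up

def gelu_and_mul(x: torch.Tensor) -> torch.Tensor:
    gate, up = x.chunk(2, dim=-1)
    return F.gelu(gate) * up

def swigluoai_and_mul(x: torch.Tensor,
                      alpha: float = 1.702,
                      limit: float = 7.0) -> torch.Tensor:
    gate, up = x.chunk(2, dim=-1)
    gate = gate.clamp(max=limit)          # clamp the gate from above only
    up = up.clamp(min=-limit, max=limit)  # clamp the linear path both ways
    return gate * torch.sigmoid(alpha * gate) * (up + 1)
```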
@@ -209,6 +212,8 @@ def fused_marlin_moe(hidden_states: torch.Tensor,
def fused_marlin_moe_fake(hidden_states: torch.Tensor,
                          w1: torch.Tensor,
                          w2: torch.Tensor,
                          bias1: Optional[torch.Tensor],
                          bias2: Optional[torch.Tensor],
                          w1_scale: torch.Tensor,
                          w2_scale: torch.Tensor,
                          gating_output: torch.Tensor,
@@ -217,9 +222,10 @@ def fused_marlin_moe_fake(hidden_states: torch.Tensor,
                          quant_type_id: int,
                          apply_router_weight_on_input: bool = False,
                          global_num_experts: int = -1,
                          activation: Optional[str] = "silu",
                          expert_map: Optional[torch.Tensor] = None,
                          global_scale1: Optional[torch.Tensor] = None,
                          global_scale2: Optional[torch.Tensor] = None,
                          expert_map: Optional[torch.Tensor] = None,
                          g_idx1: Optional[torch.Tensor] = None,
                          g_idx2: Optional[torch.Tensor] = None,
                          sort_indices1: Optional[torch.Tensor] = None,

(`expert_map` appears twice because the flattened view shows both positions: the parameter moves up next to the new `activation` argument, and the old line below `global_scale2` is removed.)
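`fused_marlin_moe_fake` is the shape-only "fake" implementation registered alongside the real custom op so torch.compile can trace the call without running the kernel, which is why its signature must stay in sync with the real op, including the new `activation` parameter. A self-contained sketch of this pattern with a hypothetical op (names assumed, not vLLM's registration code):

```python
import torch

# Real implementation: runs actual compute.
@torch.library.custom_op("demo::gated_mlp", mutates_args=())
def gated_mlp(x: torch.Tensor, w: torch.Tensor) -> torch.Tensor:
    return torch.nn.functional.silu(x @ w)

# Fake implementation: shapes/dtypes only, used under tracing. Its signature
# must match the real op exactly, or compiled graphs break.
@gated_mlp.register_fake
def _(x: torch.Tensor, w: torch.Tensor) -> torch.Tensor:
    return x.new_empty(x.shape[0], w.shape[1])
```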
@@ -518,8 +518,6 @@ class AWQMoEMethod(FusedMoEMethodBase):
            raise NotImplementedError(
                "EPLB not supported for `AWQMoEMethod` yet.")

        assert activation == "silu", "Only SiLU activation is supported."

        topk_weights, topk_ids = FusedMoE.select_experts(
            hidden_states=x,
            router_logits=router_logits,
@@ -548,6 +546,7 @@ class AWQMoEMethod(FusedMoEMethodBase):
            quant_type_id=self.quant_type.id,
            apply_router_weight_on_input=apply_router_weight_on_input,
            global_num_experts=global_num_experts,
            activation=activation,
            expert_map=expert_map,
            w1_zeros=layer.w13_qzeros,
            w2_zeros=layer.w2_qzeros,
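Across the quantization methods touched by this diff the pattern is the same: the blanket SiLU-only assertion is dropped and `activation` is forwarded into `torch.ops.vllm.fused_marlin_moe`, which now dispatches on the activation itself. A hedged sketch of the resulting fail-fast guard; the set of supported names is assumed from the dispatch shown above and is not an exported vLLM API:

```python
# Assumed from the silu/gelu/swigluoai dispatch in fused_marlin_moe.
_MARLIN_ACTIVATIONS = {"silu", "gelu", "swigluoai"}

def _check_marlin_activation(activation: str) -> None:
    # Fail fast with a descriptive message instead of a blanket "silu only".
    if activation not in _MARLIN_ACTIVATIONS:
        raise ValueError(f"{activation} not supported for Marlin MoE.")
```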
@@ -373,7 +373,6 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
        if enable_eplb:
            raise NotImplementedError("EPLB not supported for "
                                      "`CompressedTensorsW4A4MoeMethod` yet.")
        assert activation == "silu", "Only SiLU activation is supported."

        topk_weights, topk_ids = FusedMoE.select_experts(
            hidden_states=x,
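`FusedMoE.select_experts` computes per-token routing before the fused kernel runs. A simplified sketch of softmax top-k routing with renormalized weights (vLLM's implementation additionally covers grouped top-k, score correction, and other routing variants):

```python
import torch

def select_experts(router_logits: torch.Tensor, top_k: int):
    # (num_tokens, num_experts) -> (num_tokens, top_k) weights and ids.
    probs = torch.softmax(router_logits, dim=-1, dtype=torch.float32)
    topk_weights, topk_ids = probs.topk(top_k, dim=-1)
    topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
    return topk_weights, topk_ids
```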
@@ -412,10 +411,14 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
                quant_type_id=scalar_types.float4_e2m1f.id,
                apply_router_weight_on_input=apply_router_weight_on_input,
                global_num_experts=global_num_experts,
                activation=activation,
                expert_map=expert_map,
                workspace=layer.workspace)

        elif self.fused_experts is not None:
            assert activation == "silu", "Only SiLU activation is supported."

        # FlashInfer fused experts path
        if self.fused_experts is not None:
            assert is_valid_flashinfer_cutlass_fused_moe(
                x, layer.w13_weight, layer.w2_weight), (
                    "Flashinfer CUTLASS Fused MoE not applicable!")

(Both the old `if` guard and the new `elif` guard for the FlashInfer path appear in the flattened view; the new branch keeps a SiLU-only assertion for that path.)
@@ -871,8 +874,6 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
        # cutlass fp8 or fused_experts but not marlin or rocm.
        #
        if self.use_marlin:
            assert activation == "silu", (
                f"{activation} not supported for Marlin MoE.")
            assert self.fused_experts is None
            return torch.ops.vllm.fused_marlin_moe(
                x,
@@ -888,6 +889,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
                quant_type_id=scalar_types.float8_e4m3fn.id,
                apply_router_weight_on_input=apply_router_weight_on_input,
                global_num_experts=global_num_experts,
                activation=activation,
                expert_map=expert_map,
                workspace=layer.workspace)
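The `expert_map` argument threaded through these calls translates global expert ids into local ids under expert parallelism. A sketch under the assumed convention that -1 marks experts hosted on other ranks:

```python
import torch

def build_expert_map(global_num_experts: int,
                     local_expert_ids: list[int]) -> torch.Tensor:
    # -1 means "not on this rank"; local experts get dense local indices.
    expert_map = torch.full((global_num_experts,), -1, dtype=torch.int32)
    for local_id, global_id in enumerate(local_expert_ids):
        expert_map[global_id] = local_id
    return expert_map

# A rank owning experts 4..7 out of 8:
print(build_expert_map(8, [4, 5, 6, 7]))  # tensor([-1, -1, -1, -1, 0, 1, 2, 3], ...)
```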
@@ -1411,9 +1413,6 @@ class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod):
                "EPLB not supported for "
                "`CompressedTensorsWNA16MarlinMoEMethod` yet.")

        assert activation == "silu", (
            f"{activation} not supported for Marlin MoE.")

        topk_weights, topk_ids = FusedMoE.select_experts(
            hidden_states=x,
            router_logits=router_logits,
@@ -1442,6 +1441,7 @@ class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod):
            quant_type_id=self.quant_type.id,
            apply_router_weight_on_input=apply_router_weight_on_input,
            global_num_experts=global_num_experts,
            activation=activation,
            expert_map=expert_map,
            g_idx1=layer.w13_weight_g_idx,
            g_idx2=layer.w2_weight_g_idx,
@@ -1003,8 +1003,6 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                expert_map=expert_map,
                quant_config=self.moe_quant_config)
        elif self.use_marlin:
            assert activation == "silu", (
                f"{activation} not supported for Marlin MoE.")
            assert self.fused_experts is None
            return torch.ops.vllm.fused_marlin_moe(
                x,
@@ -1020,6 +1018,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                quant_type_id=scalar_types.float8_e4m3fn.id,
                apply_router_weight_on_input=apply_router_weight_on_input,
                global_num_experts=global_num_experts,
                activation=activation,
                expert_map=expert_map,
                workspace=layer.workspace)
        elif self.fused_experts:
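`Fp8MoEMethod`'s marlin path consumes `float8_e4m3fn` weights plus scales (hence `quant_type_id=scalar_types.float8_e4m3fn.id` above). A minimal per-tensor round-trip sketch; real vLLM quantization is per-channel or per-block and kernel-specific:

```python
import torch

def quantize_fp8_e4m3(w: torch.Tensor):
    finfo = torch.finfo(torch.float8_e4m3fn)
    scale = w.abs().amax().clamp(min=1e-12) / finfo.max
    w_fp8 = (w / scale).clamp(finfo.min, finfo.max).to(torch.float8_e4m3fn)
    return w_fp8, scale

w = torch.randn(128, 256)
w_fp8, scale = quantize_fp8_e4m3(w)
w_hat = w_fp8.to(torch.float32) * scale  # dequantize for comparison
```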
@@ -667,8 +667,6 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
            raise NotImplementedError(
                "EPLB not supported for `GPTQMarlinMoEMethod` yet.")

        assert activation == "silu", "Only SiLU activation is supported."

        topk_weights, topk_ids = FusedMoE.select_experts(
            hidden_states=x,
            router_logits=router_logits,
@@ -697,6 +695,7 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
            quant_type_id=self.quant_type.id,
            apply_router_weight_on_input=apply_router_weight_on_input,
            global_num_experts=global_num_experts,
            activation=activation,
            expert_map=expert_map,
            g_idx1=layer.w13_g_idx,
            g_idx2=layer.w2_g_idx,
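The `g_idx1`/`g_idx2` and `sort_indices` tensors come from GPTQ activation reordering ("act-order"): weight rows are grouped by quantization group rather than kept in natural order, so activations must be permuted to match before the matmul. A toy sketch of the idea (hypothetical helper, not vLLM's kernel logic):

```python
import torch

def matmul_with_act_order(x: torch.Tensor, w_permuted: torch.Tensor,
                          sort_indices: torch.Tensor) -> torch.Tensor:
    # w_permuted rows were reordered offline by sort_indices; permuting the
    # activation columns the same way keeps the contraction aligned, since
    # sum_i x[perm[i]] * W[perm[i], j] == sum_k x[k] * W[k, j].
    return x[:, sort_indices] @ w_permuted
```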
@@ -1411,13 +1411,13 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
        if enable_eplb:
            raise NotImplementedError(
                "EPLB not supported for `ModelOptNvFp4FusedMoE` yet.")
        assert activation == "silu", "Only SiLU activation is supported."

        if (self.allow_flashinfer and self.flashinfer_moe_backend
                == FlashinferMoeBackend.TENSORRT_LLM):
            import flashinfer

            from vllm.model_executor.models.llama4 import Llama4MoE
            assert activation == "silu", "Only SiLU activation is supported."

            assert self.fused_experts is None

(The SiLU-only assertion appears twice because it moves: removed from the top of the method and re-added inside the FlashInfer TensorRT-LLM branch, which still supports only SiLU.)
@@ -1507,6 +1507,7 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
                quant_type_id=scalar_types.float4_e2m1f.id,
                apply_router_weight_on_input=apply_router_weight_on_input,
                global_num_experts=global_num_experts,
                activation=activation,
                expert_map=expert_map,
                workspace=layer.workspace)
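`ModelOptNvFp4FusedMoE` passes `quant_type_id=scalar_types.float4_e2m1f.id`: weights are 4-bit floats with 1 sign, 2 exponent, and 1 mantissa bit, whose representable magnitudes are {0, 0.5, 1, 1.5, 2, 3, 4, 6}. A toy round-to-nearest sketch with a single scale (real nvfp4 kernels pack two values per byte and use per-group scales):

```python
import torch

E2M1_POS = torch.tensor([0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0])

def fake_quant_e2m1(x: torch.Tensor) -> torch.Tensor:
    scale = x.abs().amax().clamp(min=1e-12) / 6.0
    grid = torch.cat([-E2M1_POS.flip(0), E2M1_POS])  # all representable values
    idx = (x.unsqueeze(-1) / scale - grid).abs().argmin(dim=-1)
    return grid[idx] * scale
```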