Mirror of https://github.com/huggingface/transformers.git (synced 2025-11-02 18:54:35 +08:00)

Compare commits: v4.56.0...moe-attrib (5 commits)
| SHA1 |
|---|
| a59a3b3e24 |
| 90441d3946 |
| 4bcac5ed53 |
| 7dba66cc42 |
| db6aa9a2f0 |
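The commits compared here add MoE attribute aliases to a batch of config classes so that generic names such as `num_experts`, `num_experts_per_tok`, and `num_shared_experts` resolve to whatever field each model actually stores (DBRX instead copies the values explicitly; see its hunk below). The following is a minimal, self-contained sketch of that aliasing pattern — illustrative only, not the actual `PretrainedConfig` implementation:

```python
# Minimal sketch of the attribute_map aliasing pattern (illustrative only).
# A class-level map redirects generic attribute names to whatever field a
# particular config actually stores.


class ToyConfig:
    # Generic name -> model-specific field name.
    attribute_map = {}

    def __getattr__(self, name):
        # Called only when normal lookup fails; follow the alias if one exists.
        if name != "attribute_map" and name in self.attribute_map:
            return getattr(self, self.attribute_map[name])
        raise AttributeError(name)

    def __setattr__(self, name, value):
        # Store aliased names under their model-specific field.
        name = self.attribute_map.get(name, name)
        super().__setattr__(name, value)


class ToyAriaTextConfig(ToyConfig):
    # Mirrors the AriaTextConfig aliases shown in the first hunk below.
    attribute_map = {
        "num_experts": "moe_num_experts",
        "num_experts_per_tok": "moe_topk",
    }

    def __init__(self, moe_num_experts=64, moe_topk=6):
        self.moe_num_experts = moe_num_experts
        self.moe_topk = moe_topk


cfg = ToyAriaTextConfig()
print(cfg.num_experts, cfg.num_experts_per_tok)  # 64 6, read through the aliases
```

With this kind of map in place, callers can use the generic names while each config keeps its historical field names.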
@@ -134,6 +134,11 @@ class AriaTextConfig(PretrainedConfig):

    model_type = "aria_text"
    keys_to_ignore_at_inference = ["past_key_values"]
    attribute_map = {
        "num_experts": "moe_num_experts",
        "num_experts_per_tok": "moe_topk",
        "num_shared_experts": "moe_num_shared_experts",
    }
    # Default tensor parallel plan for base model `AriaTextModel`
    base_model_tp_plan = {
        "layers.*.self_attn.q_proj": "colwise",
@@ -210,6 +210,9 @@ class DbrxConfig(PretrainedConfig):
        else:
            self.ffn_config = ffn_config

        self.num_experts = self.ffn_config.moe_num_experts
        self.num_experts_per_tok = self.ffn_config.moe_top_k
        self.norm_topk_prob = self.ffn_config.moe_norm_topk_prob
        self.d_model = d_model
        self.n_heads = n_heads
        self.n_layers = n_layers
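DBRX is the one outlier in this set: its MoE settings live on a nested `ffn_config`, so the hunk above mirrors them onto the top-level config in `__init__` rather than relying on an attribute alias. A rough sketch of that flattening pattern, using hypothetical toy classes:

```python
# Illustrative sketch only: mirror nested FFN/MoE settings onto the parent config,
# the way the DbrxConfig hunk above copies moe_num_experts / moe_top_k / moe_norm_topk_prob.
from dataclasses import dataclass


@dataclass
class ToyFFNConfig:
    moe_num_experts: int = 16
    moe_top_k: int = 4
    moe_norm_topk_prob: bool = True


class ToyDbrxLikeConfig:
    def __init__(self, ffn_config=None):
        self.ffn_config = ffn_config if ffn_config is not None else ToyFFNConfig()
        # Flatten the nested MoE fields so generic code can read them directly.
        self.num_experts = self.ffn_config.moe_num_experts
        self.num_experts_per_tok = self.ffn_config.moe_top_k
        self.norm_topk_prob = self.ffn_config.moe_norm_topk_prob


cfg = ToyDbrxLikeConfig()
print(cfg.num_experts, cfg.num_experts_per_tok)  # 16 4
```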
@@ -125,6 +125,11 @@ class DeepseekV2Config(PretrainedConfig):

    model_type = "deepseek_v2"
    keys_to_ignore_at_inference = ["past_key_values"]
    attribute_map = {
        "num_experts_per_tok": "top_k",
        "num_shared_experts": "n_shared_experts",
        "num_experts": "n_routed_experts",
    }

    base_model_tp_plan = {
        "layers.*.self_attn.q_proj": "colwise",
@@ -132,6 +132,9 @@ class DeepseekV3Config(PretrainedConfig):

    model_type = "deepseek_v3"
    keys_to_ignore_at_inference = ["past_key_values"]
    attribute_map = {
        "num_experts_per_tok": "top_k",
    }
    base_model_tp_plan = {  # TODO: only replicate attention layers when > first_k_dense_replace
        "layers.*.mlp.experts.*.gate_proj": "local_colwise",
        "layers.*.mlp.experts.*.up_proj": "local_colwise",
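The `base_model_tp_plan` entries above use `*` wildcards to cover every layer and every expert (for example `layers.*.mlp.experts.*.gate_proj`). As a rough illustration of how such a plan can be consulted, the sketch below matches parameter names against the wildcard keys with `fnmatch`; the real transformers plan resolver is more involved, so treat this purely as a sketch of the key syntax:

```python
# Illustrative only: resolve a tensor-parallel style for a parameter name by matching
# it against wildcard keys like the ones in base_model_tp_plan above.
from fnmatch import fnmatch

tp_plan = {
    "layers.*.mlp.experts.*.gate_proj": "local_colwise",
    "layers.*.mlp.experts.*.up_proj": "local_colwise",
}


def resolve_tp_style(param_name, plan):
    for pattern, style in plan.items():
        if fnmatch(param_name, pattern):
            return style
    return None  # no plan entry: the parameter is left unsharded here


print(resolve_tp_style("layers.3.mlp.experts.17.gate_proj", tp_plan))  # local_colwise
print(resolve_tp_style("layers.3.self_attn.q_proj", tp_plan))          # None
```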
@@ -139,6 +139,9 @@ class DogeConfig(PretrainedConfig):

    model_type = "doge"
    keys_to_ignore_at_inference = ["past_key_values"]
    attribute_map = {
        "num_experts_per_tok": "top_k",
    }
    # Default tensor parallel plan for base model `DogeModel`
    base_model_tp_plan = {
        "layers.*.self_attn.q_proj": "colwise",
@@ -105,6 +105,9 @@ class Dots1Config(PretrainedConfig):

    model_type = "dots1"
    keys_to_ignore_at_inference = ["past_key_values"]
    attribute_map = {
        "num_experts_per_tok": "top_k",
    }

    base_model_tp_plan = {  # TODO: only replicate attention layers when > first_k_dense_replace
        "layers.*.self_attn.q_proj": "colwise",
@@ -147,6 +147,9 @@ class Glm4MoeConfig(PretrainedConfig):

    model_type = "glm4_moe"
    keys_to_ignore_at_inference = ["past_key_values"]
    attribute_map = {
        "num_experts_per_tok": "top_k",
    }

    # Default tensor parallel plan for base model `Glm4Moe`
    base_model_tp_plan = {
@@ -222,6 +222,9 @@ class Glm4vMoeTextConfig(PretrainedConfig):

    model_type = "Glm4vMoe_text"
    keys_to_ignore_at_inference = ["past_key_values"]
    attribute_map = {
        "num_experts_per_tok": "top_k",
    }
    # Default tensor parallel plan for base model `Glm4vMoe`
    base_model_tp_plan = {
        "layers.*.self_attn.q_proj": "colwise",
@@ -44,6 +44,10 @@ class GptOssConfig(PretrainedConfig):
        "layers.*.mlp.experts.down_proj": "grouped_gemm",
        "layers.*.mlp.experts.down_proj_bias": "grouped_gemm",
    }
    attribute_map = {
        "num_experts": "num_local_experts",
        "num_experts_per_tok": "top_k",
    }

    def __init__(
        self,
@@ -116,6 +116,10 @@ class GraniteMoeConfig(PretrainedConfig):

    model_type = "granitemoe"
    keys_to_ignore_at_inference = ["past_key_values"]
    attribute_map = {
        "num_experts": "num_local_experts",
        "num_experts_per_tok": "top_k",
    }

    def __init__(
        self,
@@ -133,6 +133,8 @@ class GraniteMoeHybridConfig(PretrainedConfig):
    model_type = "granitemoehybrid"
    attribute_map = {
        "layers_block_type": "layer_types",
        "num_experts": "num_local_experts",
        "num_experts_per_tok": "top_k",
    }
    keys_to_ignore_at_inference = ["past_key_values"]
@@ -118,6 +118,10 @@ class GraniteMoeSharedConfig(PretrainedConfig):

    model_type = "granitemoeshared"
    keys_to_ignore_at_inference = ["past_key_values"]
    attribute_map = {
        "num_experts": "num_local_experts",
        "num_experts_per_tok": "top_k",
    }

    def __init__(
        self,
@@ -124,6 +124,9 @@ class JambaConfig(PretrainedConfig):

    model_type = "jamba"
    keys_to_ignore_at_inference = ["past_key_values"]
    attribute_map = {
        "num_experts_per_tok": "top_k",
    }

    def __init__(
        self,
@@ -94,6 +94,10 @@ class JetMoeConfig(PretrainedConfig):

    model_type = "jetmoe"
    keys_to_ignore_at_inference = ["past_key_values"]
    attribute_map = {
        "num_experts": "num_local_experts",
        "num_experts_per_tok": "top_k",
    }

    def __init__(
        self,
@@ -275,6 +275,10 @@ class Llama4TextConfig(PretrainedConfig):
        "layers.*.feed_forward.down_proj": "local_rowwise",
        "layers.*.feed_forward.router": "ep_router",
    }
    attribute_map = {
        "num_experts": "num_local_experts",
        "num_experts_per_tok": "top_k",
    }

    def __init__(
        self,
@@ -141,6 +141,10 @@ class MiniMaxConfig(PretrainedConfig):
        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
        "norm": (["hidden_states"], ["hidden_states"]),
    }
    attribute_map = {
        "num_experts": "num_local_experts",
        "num_experts_per_tok": "top_k",
    }

    def __init__(
        self,
@@ -124,6 +124,10 @@ class MixtralConfig(PretrainedConfig):
        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
        "norm": (["hidden_states"], ["hidden_states"]),
    }
    attribute_map = {
        "num_experts": "num_local_experts",
        "num_experts_per_tok": "top_k",
    }

    def __init__(
        self,
@@ -108,6 +108,9 @@ class OlmoeConfig(PretrainedConfig):

    model_type = "olmoe"
    keys_to_ignore_at_inference = ["past_key_values"]
    attribute_map = {
        "num_experts_per_tok": "top_k",
    }

    def __init__(
        self,
@@ -112,6 +112,9 @@ class PhimoeConfig(PretrainedConfig):

    model_type = "phimoe"
    keys_to_ignore_at_inference = ["past_key_values"]
    attribute_map = {
        "num_experts": "num_local_experts",
    }

    def __init__(
        self,
@@ -150,6 +150,9 @@ class Qwen2MoeConfig(PretrainedConfig):

    model_type = "qwen2_moe"
    keys_to_ignore_at_inference = ["past_key_values"]
    attribute_map = {
        "num_experts_per_tok": "top_k",
    }

    # Default tensor parallel plan for base model `Qwen2Moe`
    base_model_tp_plan = {
@@ -147,6 +147,9 @@ class Qwen3MoeConfig(PretrainedConfig):

    model_type = "qwen3_moe"
    keys_to_ignore_at_inference = ["past_key_values"]
    attribute_map = {
        "num_experts_per_tok": "top_k",
    }

    # Default tensor parallel plan for base model `Qwen3Moe`
    base_model_tp_plan = {
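With the aliases in place across these hunks, MoE-aware utilities can read a single set of attribute names regardless of which config they receive. A hedged usage sketch follows: the `describe_moe` helper is hypothetical, and `SimpleNamespace` objects stand in for the real config classes.

```python
# Hypothetical helper: with the aliases in place, generic code can ask every MoE config
# the same questions. SimpleNamespace stands in for real config objects here.
from types import SimpleNamespace


def describe_moe(config):
    experts = getattr(config, "num_experts", None)
    top_k = getattr(config, "num_experts_per_tok", None)
    shared = getattr(config, "num_shared_experts", 0)
    return f"{experts} experts, top-{top_k} routing, {shared} shared"


# Hypothetical stand-ins with made-up values, just to exercise the helper.
qwen_like = SimpleNamespace(num_experts=128, num_experts_per_tok=8)
aria_like = SimpleNamespace(num_experts=64, num_experts_per_tok=6, num_shared_experts=2)

for cfg in (qwen_like, aria_like):
    print(describe_moe(cfg))
```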