Compare commits

...

5 Commits

SHA1        Message                                                      Date
a59a3b3e24  Revert "num_shared_experts -> num_experts_shared"           2025-08-25 16:19:24 +02:00
            This reverts commit 90441d3946163c674d08cd6e3a1d1c002082c1f1.
90441d3946  num_shared_experts -> num_experts_shared                    2025-08-25 15:00:37 +01:00
            More coherent with other `num_experts_*` configs
4bcac5ed53  update                                                      2025-08-25 15:14:36 +02:00
7dba66cc42  update more attribute maps                                  2025-08-20 15:19:05 +02:00
db6aa9a2f0  add attribute map                                           2025-08-14 11:18:27 +02:00
21 changed files with 73 additions and 0 deletions
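
All of these additions lean on the same mechanism: `PretrainedConfig` routes attribute reads and writes through `attribute_map`, so a standardized name such as `num_experts_per_tok` resolves to the model-specific field (`top_k`, `moe_topk`, ...). Below is a minimal, self-contained sketch of that pattern; it is an illustration only, not the actual transformers implementation, and the class names are made up.

class SimpleConfig:
    # Maps standardized attribute names to the names actually stored on the config.
    attribute_map = {}

    def __setattr__(self, key, value):
        # Writes to a standardized name land on the model-specific field.
        key = self.attribute_map.get(key, key)
        super().__setattr__(key, value)

    def __getattribute__(self, key):
        # Reads of a standardized name are redirected as well ("attribute_map"
        # itself is excluded to avoid infinite recursion).
        if key != "attribute_map" and key in super().__getattribute__("attribute_map"):
            key = super().__getattribute__("attribute_map")[key]
        return super().__getattribute__(key)

class ToyMoeConfig(SimpleConfig):
    attribute_map = {"num_experts_per_tok": "top_k"}

    def __init__(self, top_k=2):
        self.top_k = top_k

cfg = ToyMoeConfig(top_k=8)
print(cfg.num_experts_per_tok)  # 8, read through the mapping
cfg.num_experts_per_tok = 4     # actually written to cfg.top_k
print(cfg.top_k)                # 4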

@@ -134,6 +134,11 @@ class AriaTextConfig(PretrainedConfig):
model_type = "aria_text"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {
"num_experts": "moe_num_experts",
"num_experts_per_tok": "moe_topk",
"num_shared_experts": "moe_num_shared_experts",
}
# Default tensor parallel plan for base model `AriaTextModel`
base_model_tp_plan = {
"layers.*.self_attn.q_proj": "colwise",

@@ -210,6 +210,9 @@ class DbrxConfig(PretrainedConfig):
else:
self.ffn_config = ffn_config
self.num_experts = self.ffn_config.moe_num_experts
self.num_experts_per_tok = self.ffn_config.moe_top_k
self.norm_topk_prob = self.ffn_config.moe_norm_topk_prob
self.d_model = d_model
self.n_heads = n_heads
self.n_layers = n_layers

@@ -125,6 +125,11 @@ class DeepseekV2Config(PretrainedConfig):
model_type = "deepseek_v2"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {
"num_experts_per_tok": "top_k",
"num_shared_experts": "n_shared_experts",
"num_experts": "n_routed_experts",
}
base_model_tp_plan = {
"layers.*.self_attn.q_proj": "colwise",

@@ -132,6 +132,9 @@ class DeepseekV3Config(PretrainedConfig):
model_type = "deepseek_v3"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {
"num_experts_per_tok": "top_k",
}
base_model_tp_plan = { # TODO: only replicate attention layers when > first_k_dense_replace
"layers.*.mlp.experts.*.gate_proj": "local_colwise",
"layers.*.mlp.experts.*.up_proj": "local_colwise",

@@ -139,6 +139,9 @@ class DogeConfig(PretrainedConfig):
model_type = "doge"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {
"num_experts_per_tok": "top_k",
}
# Default tensor parallel plan for base model `DogeModel`
base_model_tp_plan = {
"layers.*.self_attn.q_proj": "colwise",

@@ -105,6 +105,9 @@ class Dots1Config(PretrainedConfig):
model_type = "dots1"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {
"num_experts_per_tok": "top_k",
}
base_model_tp_plan = { # TODO: only replicate attention layers when > first_k_dense_replace
"layers.*.self_attn.q_proj": "colwise",

@@ -147,6 +147,9 @@ class Glm4MoeConfig(PretrainedConfig):
model_type = "glm4_moe"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {
"num_experts_per_tok": "top_k",
}
# Default tensor parallel plan for base model `Glm4Moe`
base_model_tp_plan = {

@@ -222,6 +222,9 @@ class Glm4vMoeTextConfig(PretrainedConfig):
model_type = "Glm4vMoe_text"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {
"num_experts_per_tok": "top_k",
}
# Default tensor parallel plan for base model `Glm4vMoe`
base_model_tp_plan = {
"layers.*.self_attn.q_proj": "colwise",

@@ -44,6 +44,10 @@ class GptOssConfig(PretrainedConfig):
"layers.*.mlp.experts.down_proj": "grouped_gemm",
"layers.*.mlp.experts.down_proj_bias": "grouped_gemm",
}
attribute_map = {
"num_experts": "num_local_experts",
"num_experts_per_tok": "top_k",
}
def __init__(
self,

@@ -116,6 +116,10 @@ class GraniteMoeConfig(PretrainedConfig):
model_type = "granitemoe"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {
"num_experts": "num_local_experts",
"num_experts_per_tok": "top_k",
}
def __init__(
self,

@@ -133,6 +133,8 @@ class GraniteMoeHybridConfig(PretrainedConfig):
model_type = "granitemoehybrid"
attribute_map = {
"layers_block_type": "layer_types",
"num_experts": "num_local_experts",
"num_experts_per_tok": "top_k",
}
keys_to_ignore_at_inference = ["past_key_values"]

@@ -118,6 +118,10 @@ class GraniteMoeSharedConfig(PretrainedConfig):
model_type = "granitemoeshared"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {
"num_experts": "num_local_experts",
"num_experts_per_tok": "top_k",
}
def __init__(
self,

@@ -124,6 +124,9 @@ class JambaConfig(PretrainedConfig):
model_type = "jamba"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {
"num_experts_per_tok": "top_k",
}
def __init__(
self,

@@ -94,6 +94,10 @@ class JetMoeConfig(PretrainedConfig):
model_type = "jetmoe"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {
"num_experts": "num_local_experts",
"num_experts_per_tok": "top_k",
}
def __init__(
self,

@@ -275,6 +275,10 @@ class Llama4TextConfig(PretrainedConfig):
"layers.*.feed_forward.down_proj": "local_rowwise",
"layers.*.feed_forward.router": "ep_router",
}
attribute_map = {
"num_experts": "num_local_experts",
"num_experts_per_tok": "top_k",
}
def __init__(
self,

@@ -141,6 +141,10 @@ class MiniMaxConfig(PretrainedConfig):
"layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
"norm": (["hidden_states"], ["hidden_states"]),
}
attribute_map = {
"num_experts": "num_local_experts",
"num_experts_per_tok": "top_k",
}
def __init__(
self,

@@ -124,6 +124,10 @@ class MixtralConfig(PretrainedConfig):
"layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
"norm": (["hidden_states"], ["hidden_states"]),
}
attribute_map = {
"num_experts": "num_local_experts",
"num_experts_per_tok": "top_k",
}
def __init__(
self,

@@ -108,6 +108,9 @@ class OlmoeConfig(PretrainedConfig):
model_type = "olmoe"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {
"num_experts_per_tok": "top_k",
}
def __init__(
self,

@@ -112,6 +112,9 @@ class PhimoeConfig(PretrainedConfig):
model_type = "phimoe"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {
"num_experts": "num_local_experts",
}
def __init__(
self,

@@ -150,6 +150,9 @@ class Qwen2MoeConfig(PretrainedConfig):
model_type = "qwen2_moe"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {
"num_experts_per_tok": "top_k",
}
# Default tensor parallel plan for base model `Qwen2Moe`
base_model_tp_plan = {

@@ -147,6 +147,9 @@ class Qwen3MoeConfig(PretrainedConfig):
model_type = "qwen3_moe"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {
"num_experts_per_tok": "top_k",
}
# Default tensor parallel plan for base model `Qwen3Moe`
base_model_tp_plan = {
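
Once these changes land, callers can read the standardized names uniformly without knowing each model's native field. A quick check, assuming a transformers build that already includes this branch (printed values depend on each config's defaults):

from transformers import MixtralConfig, Qwen3MoeConfig  # assumes a build containing these changes

for cfg in (MixtralConfig(), Qwen3MoeConfig()):
    # `num_experts_per_tok` resolves through the new attribute_map entries,
    # whatever the underlying field is called on each config.
    print(type(cfg).__name__, cfg.num_experts_per_tok)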