diff --git a/src/transformers/models/deepseek_v2/configuration_deepseek_v2.py b/src/transformers/models/deepseek_v2/configuration_deepseek_v2.py
index f7b81216d332..101d699194fd 100644
--- a/src/transformers/models/deepseek_v2/configuration_deepseek_v2.py
+++ b/src/transformers/models/deepseek_v2/configuration_deepseek_v2.py
@@ -127,8 +127,9 @@ class DeepseekV2Config(PreTrainedConfig):
         "layers.*.self_attn.q_b_proj": "colwise",
         "layers.*.self_attn.kv_b_proj": "colwise",
         "layers.*.self_attn.o_proj": "rowwise",
-        "layers.*.mlp.gate_up_proj": "colwise",
-        "layers.*.mlp.down_proj": "rowwise",
+        "layers.*.mlp.experts.gate_up_proj": "local_colwise",
+        "layers.*.mlp.experts.down_proj": "local_rowwise",
+        "layers.*.mlp.experts": "gather",
     }
     base_model_pp_plan = {
         "embed_tokens": (["input_ids"], ["inputs_embeds"]),
diff --git a/src/transformers/models/deepseek_v2/modular_deepseek_v2.py b/src/transformers/models/deepseek_v2/modular_deepseek_v2.py
index 3deed89d041f..17ab64923470 100644
--- a/src/transformers/models/deepseek_v2/modular_deepseek_v2.py
+++ b/src/transformers/models/deepseek_v2/modular_deepseek_v2.py
@@ -142,8 +142,9 @@ class DeepseekV2Config(LlamaConfig):
         "layers.*.self_attn.q_b_proj": "colwise",
         "layers.*.self_attn.kv_b_proj": "colwise",
         "layers.*.self_attn.o_proj": "rowwise",
-        "layers.*.mlp.gate_up_proj": "colwise",
-        "layers.*.mlp.down_proj": "rowwise",
+        "layers.*.mlp.experts.gate_up_proj": "local_colwise",
+        "layers.*.mlp.experts.down_proj": "local_rowwise",
+        "layers.*.mlp.experts": "gather",
     }

     model_type = "deepseek_v2"
diff --git a/src/transformers/models/deepseek_v3/configuration_deepseek_v3.py b/src/transformers/models/deepseek_v3/configuration_deepseek_v3.py
index f90c5e175ba5..928a0e1fcf7a 100644
--- a/src/transformers/models/deepseek_v3/configuration_deepseek_v3.py
+++ b/src/transformers/models/deepseek_v3/configuration_deepseek_v3.py
@@ -131,19 +131,16 @@ class DeepseekV3Config(PreTrainedConfig):
     model_type = "deepseek_v3"
     keys_to_ignore_at_inference = ["past_key_values"]
-    base_model_tp_plan = {  # TODO: only replicate attention layers when > first_k_dense_replace
-        "layers.*.mlp.experts.*.gate_proj": "local_colwise",
-        "layers.*.mlp.experts.*.up_proj": "local_colwise",
-        "layers.*.mlp.experts.*.down_proj": "local_rowwise",
-        "layers.*.mlp.experts.*": "local",  # each expert is wrapped in a module list
-        "layers.*.mlp.shared_experts.gate_proj": "local_colwise",
-        "layers.*.mlp.shared_experts.up_proj": "local_colwise",
-        "layers.*.mlp.shared_experts.down_proj": "local_rowwise",
-        "layers.*.mlp.shared_experts": "local",
-        "layers.*.mlp.gate_proj": "local_colwise",
-        "layers.*.mlp.up_proj": "local_colwise",
-        "layers.*.mlp.down_proj": "local_rowwise",
-        "layers.*.mlp": "gather",  # This is the only moment where results are gathered
+    base_model_tp_plan = {
+        "layers.*.mlp.experts.gate_up_proj": "local_rowwise",
+        "layers.*.mlp.experts.down_proj": "local_rowwise",
+        "layers.*.mlp.experts": "gather",
+        "layers.*.mlp.shared_experts.gate_proj": "colwise",
+        "layers.*.mlp.shared_experts.up_proj": "colwise",
+        "layers.*.mlp.shared_experts.down_proj": "rowwise",
+        "layers.*.mlp.gate_proj": "colwise",
+        "layers.*.mlp.up_proj": "colwise",
+        "layers.*.mlp.down_proj": "rowwise",
     }
     base_model_pp_plan = {
         "embed_tokens": (["input_ids"], ["inputs_embeds"]),
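The `local_colwise` / `local_rowwise` / `gather` entries above target one fused parameter per projection instead of one `nn.Linear` per expert. A minimal sketch of what that layout and sharding mean, assuming `gate_up_proj` has shape `(num_experts, hidden_size, 2 * intermediate_size)` and `down_proj` has shape `(num_experts, intermediate_size, hidden_size)`; the shapes, variable names, and manual slicing are illustrative only, not the transformers implementation:

```python
import torch

# Illustrative dimensions only.
num_experts, hidden, inter, world_size, rank = 8, 64, 128, 4, 0

# Fused expert weights: one 3D tensor per projection instead of a ModuleList of per-expert Linears.
gate_up_proj = torch.randn(num_experts, hidden, 2 * inter)  # was experts.*.gate_proj / up_proj
down_proj = torch.randn(num_experts, inter, hidden)         # was experts.*.down_proj

# Hand-rolled analogue of sharding the intermediate dimension across TP ranks: each rank keeps
# a slice of every expert's FFN, and the partial outputs are summed afterwards, which is what
# the final "layers.*.mlp.experts": "gather" entry is for.
shard = inter // world_size
gate_up_shard = torch.cat(
    [
        gate_up_proj[..., rank * shard : (rank + 1) * shard],                  # gate half
        gate_up_proj[..., inter + rank * shard : inter + (rank + 1) * shard],  # up half
    ],
    dim=-1,
)
down_shard = down_proj[:, rank * shard : (rank + 1) * shard, :]
print(gate_up_shard.shape, down_shard.shape)  # torch.Size([8, 64, 64]) torch.Size([8, 32, 64])
```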
diff --git a/src/transformers/models/dots1/configuration_dots1.py b/src/transformers/models/dots1/configuration_dots1.py
index a5755ad0a45f..71393a7844ba 100644
--- a/src/transformers/models/dots1/configuration_dots1.py
+++ b/src/transformers/models/dots1/configuration_dots1.py
@@ -109,23 +109,20 @@ class Dots1Config(PreTrainedConfig):
     model_type = "dots1"
     keys_to_ignore_at_inference = ["past_key_values"]
-    base_model_tp_plan = {  # TODO: only replicate attention layers when > first_k_dense_replace
+    base_model_tp_plan = {
         "layers.*.self_attn.q_proj": "colwise",
         "layers.*.self_attn.k_proj": "colwise",
         "layers.*.self_attn.v_proj": "colwise",
         "layers.*.self_attn.o_proj": "rowwise",
-        "layers.*.mlp.experts.*.gate_proj": "local_colwise",
-        "layers.*.mlp.experts.*.up_proj": "local_colwise",
-        "layers.*.mlp.experts.*.down_proj": "local_rowwise",
-        "layers.*.mlp.experts.*": "local",  # each expert is wrapped in a module list
-        "layers.*.mlp.shared_experts.gate_proj": "local_colwise",
-        "layers.*.mlp.shared_experts.up_proj": "local_colwise",
-        "layers.*.mlp.shared_experts.down_proj": "local_rowwise",
-        "layers.*.mlp.shared_experts": "local",
-        "layers.*.mlp.gate_proj": "local_colwise",
-        "layers.*.mlp.up_proj": "local_colwise",
-        "layers.*.mlp.down_proj": "local_rowwise",
-        "layers.*.mlp": "gather",  # This is the only moment where results are gathered
+        "layers.*.mlp.experts.gate_up_proj": "local_rowwise",
+        "layers.*.mlp.experts.down_proj": "local_rowwise",
+        "layers.*.mlp.experts": "gather",
+        "layers.*.mlp.shared_experts.gate_proj": "colwise",
+        "layers.*.mlp.shared_experts.up_proj": "colwise",
+        "layers.*.mlp.shared_experts.down_proj": "rowwise",
+        "layers.*.mlp.gate_proj": "colwise",
+        "layers.*.mlp.up_proj": "colwise",
+        "layers.*.mlp.down_proj": "rowwise",
     }
     base_model_pp_plan = {
diff --git a/src/transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py b/src/transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py
index 19ed1853db33..66a299b04c00 100644
--- a/src/transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py
+++ b/src/transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py
@@ -122,21 +122,15 @@ class Ernie4_5_MoeConfig(PreTrainedConfig):
         "layers.*.self_attn.k_proj": "colwise",
         "layers.*.self_attn.v_proj": "colwise",
         "layers.*.self_attn.o_proj": "rowwise",
-        # sequence parallel is pretty slow
-        # "norm.weight": "sequence_parallel",
-        # "layers.*.input_layernorm.weight": "sequence_parallel",
-        # "layers.*.post_attention_layernorm.weight": "sequence_parallel",
-        "layers.*.mlp.shared_experts.gate_proj": "local_colwise",
-        "layers.*.mlp.shared_experts.up_proj": "local_colwise",
-        "layers.*.mlp.shared_experts.down_proj": "local_rowwise",
-        "layers.*.mlp.experts.*.gate_proj": "local_colwise",
-        "layers.*.mlp.experts.*.up_proj": "local_colwise",
-        "layers.*.mlp.experts.*.down_proj": "local_rowwise",
-        "layers.*.mlp.experts": "local",
-        "layers.*.mlp.gate_proj": "local_colwise",
-        "layers.*.mlp.up_proj": "local_colwise",
-        "layers.*.mlp.down_proj": "local_rowwise",
-        "layers.*.mlp": "gather",
+        "layers.*.mlp.experts.gate_up_proj": "local_rowwise",
+        "layers.*.mlp.experts.down_proj": "local_rowwise",
+        "layers.*.mlp.experts": "gather",
+        "layers.*.mlp.shared_experts.gate_proj": "colwise",
+        "layers.*.mlp.shared_experts.up_proj": "colwise",
+        "layers.*.mlp.shared_experts.down_proj": "rowwise",
+        "layers.*.mlp.gate_proj": "colwise",
+        "layers.*.mlp.up_proj": "colwise",
+        "layers.*.mlp.down_proj": "rowwise",
     }
     base_model_pp_plan = {
         "embed_tokens": (["input_ids"], ["inputs_embeds"]),
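Because the experts now live in a single module holding fused tensors, the plan keys lose the per-expert `.*.` segment (`experts.*.gate_proj` becomes `experts.gate_up_proj`). A toy glob check of which parameter names each style of key selects; the real plan lookup in transformers is more involved, and the parameter names below are made up for illustration:

```python
from fnmatch import fnmatch

# Hypothetical parameter names before and after the fused-experts layout.
old_style = "model.layers.3.mlp.experts.17.gate_proj.weight"  # one Linear per expert
new_style = "model.layers.3.mlp.experts.gate_up_proj"         # bare nn.Parameter, so no ".weight"

def matches(plan_key: str, param_name: str) -> bool:
    # Toy stand-in for the plan lookup: strip the base-model prefix and any trailing
    # ".weight", then glob-match against the plan key.
    name = param_name.removeprefix("model.").removesuffix(".weight")
    return fnmatch(name, plan_key)

print(matches("layers.*.mlp.experts.*.gate_proj", old_style))   # True  (matched every list entry)
print(matches("layers.*.mlp.experts.gate_up_proj", new_style))  # True  (one fused tensor per layer)
```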
diff --git a/src/transformers/models/flex_olmo/configuration_flex_olmo.py b/src/transformers/models/flex_olmo/configuration_flex_olmo.py
index ff93c53ebe7f..635d398b46d9 100644
--- a/src/transformers/models/flex_olmo/configuration_flex_olmo.py
+++ b/src/transformers/models/flex_olmo/configuration_flex_olmo.py
@@ -115,9 +115,9 @@ class FlexOlmoConfig(PreTrainedConfig):
         "layers.*.self_attn.k_proj": "colwise_rep",  # we need to replicate here due to the added norm on q and k
         "layers.*.self_attn.v_proj": "colwise_rep",  # we need to replicate here due to the added norm on q and k
         "layers.*.self_attn.o_proj": "rowwise_rep",  # we need to replicate here due to the added norm on q and k
-        "layers.*.mlp.experts.*.gate_proj": "colwise",
-        "layers.*.mlp.experts.*.up_proj": "colwise",
-        "layers.*.mlp.experts.*.down_proj": "rowwise",
+        "layers.*.mlp.experts.gate_up_proj": "local_rowwise",
+        "layers.*.mlp.experts.down_proj": "local_rowwise",
+        "layers.*.mlp.experts": "gather",
     }
     base_model_pp_plan = {
         "embed_tokens": (["input_ids"], ["inputs_embeds"]),
diff --git a/src/transformers/models/flex_olmo/modular_flex_olmo.py b/src/transformers/models/flex_olmo/modular_flex_olmo.py
index a363fe1bb3a4..a25362a71f35 100644
--- a/src/transformers/models/flex_olmo/modular_flex_olmo.py
+++ b/src/transformers/models/flex_olmo/modular_flex_olmo.py
@@ -125,9 +125,9 @@ class FlexOlmoConfig(OlmoeConfig):
         "layers.*.self_attn.k_proj": "colwise_rep",  # we need to replicate here due to the added norm on q and k
         "layers.*.self_attn.v_proj": "colwise_rep",  # we need to replicate here due to the added norm on q and k
         "layers.*.self_attn.o_proj": "rowwise_rep",  # we need to replicate here due to the added norm on q and k
-        "layers.*.mlp.experts.*.gate_proj": "colwise",
-        "layers.*.mlp.experts.*.up_proj": "colwise",
-        "layers.*.mlp.experts.*.down_proj": "rowwise",
+        "layers.*.mlp.experts.gate_up_proj": "local_rowwise",
+        "layers.*.mlp.experts.down_proj": "local_rowwise",
+        "layers.*.mlp.experts": "gather",
     }
     base_model_pp_plan = {
         "embed_tokens": (["input_ids"], ["inputs_embeds"]),
diff --git a/src/transformers/models/glm4_moe/configuration_glm4_moe.py b/src/transformers/models/glm4_moe/configuration_glm4_moe.py
index 33d9afd756e5..aa1a16a95b37 100644
--- a/src/transformers/models/glm4_moe/configuration_glm4_moe.py
+++ b/src/transformers/models/glm4_moe/configuration_glm4_moe.py
@@ -121,9 +121,9 @@ class Glm4MoeConfig(PreTrainedConfig):
         "layers.*.self_attn.k_proj": "colwise",
         "layers.*.self_attn.v_proj": "colwise",
         "layers.*.self_attn.o_proj": "rowwise",
-        "layers.*.mlp.experts.*.gate_proj": "colwise",
-        "layers.*.mlp.experts.*.up_proj": "colwise",
-        "layers.*.mlp.experts.*.down_proj": "rowwise",
+        "layers.*.mlp.experts.gate_up_proj": "local_rowwise",
+        "layers.*.mlp.experts.down_proj": "local_rowwise",
+        "layers.*.mlp.experts": "gather",
         "layers.*.mlp.gate_proj": "colwise",
         "layers.*.mlp.up_proj": "colwise",
         "layers.*.mlp.down_proj": "rowwise",
diff --git a/src/transformers/models/glm4_moe/modular_glm4_moe.py b/src/transformers/models/glm4_moe/modular_glm4_moe.py
index 471d06d69ff9..0912f2289f2f 100644
--- a/src/transformers/models/glm4_moe/modular_glm4_moe.py
+++ b/src/transformers/models/glm4_moe/modular_glm4_moe.py
@@ -135,9 +135,9 @@ class Glm4MoeConfig(PreTrainedConfig):
         "layers.*.self_attn.k_proj": "colwise",
         "layers.*.self_attn.v_proj": "colwise",
         "layers.*.self_attn.o_proj": "rowwise",
-        "layers.*.mlp.experts.*.gate_proj": "colwise",
-        "layers.*.mlp.experts.*.up_proj": "colwise",
-        "layers.*.mlp.experts.*.down_proj": "rowwise",
+        "layers.*.mlp.experts.gate_up_proj": "local_rowwise",
+        "layers.*.mlp.experts.down_proj": "local_rowwise",
+        "layers.*.mlp.experts": "gather",
         "layers.*.mlp.gate_proj": "colwise",
         "layers.*.mlp.up_proj": "colwise",
         "layers.*.mlp.down_proj": "rowwise",
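The plain `colwise` / `rowwise` entries kept on the dense `mlp.*_proj` and `shared_experts.*_proj` weights are the usual Megatron-style split for a gated MLP: shard the intermediate dimension of gate/up column-wise, shard down row-wise, and sum the partial outputs once at the end. A rough numerical check of that identity with toy dimensions, simulating the ranks in a loop rather than using the DTensor machinery:

```python
import torch
import torch.nn.functional as F

torch.manual_seed(0)
hidden, inter, world_size = 16, 32, 4

x = torch.randn(2, hidden)
gate_w = torch.randn(inter, hidden)  # nn.Linear weight layout: (out_features, in_features)
up_w = torch.randn(inter, hidden)
down_w = torch.randn(hidden, inter)

# Reference (unsharded) gated MLP.
ref = (F.silu(x @ gate_w.T) * (x @ up_w.T)) @ down_w.T

# "colwise" shards the output dim of gate/up, "rowwise" shards the input dim of down;
# each rank computes a partial result and the partials are summed (the single all-reduce).
partials = []
for rank in range(world_size):
    rows = slice(rank * inter // world_size, (rank + 1) * inter // world_size)
    h = F.silu(x @ gate_w[rows].T) * (x @ up_w[rows].T)
    partials.append(h @ down_w[:, rows].T)
out = sum(partials)

print(torch.allclose(ref, out, atol=1e-5))  # True
```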
"colwise", "layers.*.mlp.up_proj": "colwise", "layers.*.mlp.down_proj": "rowwise", diff --git a/src/transformers/models/longcat_flash/configuration_longcat_flash.py b/src/transformers/models/longcat_flash/configuration_longcat_flash.py index 6163a0cad785..e99c2b8265c2 100644 --- a/src/transformers/models/longcat_flash/configuration_longcat_flash.py +++ b/src/transformers/models/longcat_flash/configuration_longcat_flash.py @@ -129,9 +129,9 @@ class LongcatFlashConfig(PreTrainedConfig): "layers.*.mlps.*.gate_proj": "colwise", "layers.*.mlps.*.up_proj": "colwise", "layers.*.mlps.*.down_proj": "rowwise", - "layers.*.mlp.experts.*.gate_proj": "colwise", - "layers.*.mlp.experts.*.up_proj": "colwise", - "layers.*.mlp.experts.*.down_proj": "rowwise", + "layers.*.mlp.experts.gate_up_proj": "local_rowwise", + "layers.*.mlp.experts.down_proj": "local_rowwise", + "layers.*.mlp.experts": "gather", } base_model_pp_plan = { diff --git a/src/transformers/models/minimax/configuration_minimax.py b/src/transformers/models/minimax/configuration_minimax.py index 77b971a7d1a9..1e582de1bff8 100644 --- a/src/transformers/models/minimax/configuration_minimax.py +++ b/src/transformers/models/minimax/configuration_minimax.py @@ -138,9 +138,9 @@ class MiniMaxConfig(PreTrainedConfig): "layers.*.self_attn.v_proj": "colwise", "layers.*.self_attn.o_proj": "rowwise", "layers.*.mlp.gate": "colwise_rep", # we need to replicate here to correctly route experts - "layers.*.mlp.experts.*.w1": "colwise", - "layers.*.mlp.experts.*.w2": "rowwise", - "layers.*.mlp.experts.*.w3": "colwise", + "layers.*.mlp.experts.gate_up_proj": "local_rowwise", + "layers.*.mlp.experts.down_proj": "local_rowwise", + "layers.*.mlp.experts": "gather", } base_model_pp_plan = { "embed_tokens": (["input_ids"], ["inputs_embeds"]), diff --git a/src/transformers/models/minimax/modular_minimax.py b/src/transformers/models/minimax/modular_minimax.py index fff1b3fe8745..2f459f770998 100644 --- a/src/transformers/models/minimax/modular_minimax.py +++ b/src/transformers/models/minimax/modular_minimax.py @@ -163,9 +163,9 @@ class MiniMaxConfig(PreTrainedConfig): "layers.*.self_attn.v_proj": "colwise", "layers.*.self_attn.o_proj": "rowwise", "layers.*.mlp.gate": "colwise_rep", # we need to replicate here to correctly route experts - "layers.*.mlp.experts.*.w1": "colwise", - "layers.*.mlp.experts.*.w2": "rowwise", - "layers.*.mlp.experts.*.w3": "colwise", + "layers.*.mlp.experts.gate_up_proj": "local_rowwise", + "layers.*.mlp.experts.down_proj": "local_rowwise", + "layers.*.mlp.experts": "gather", } base_model_pp_plan = { "embed_tokens": (["input_ids"], ["inputs_embeds"]), diff --git a/src/transformers/models/mixtral/configuration_mixtral.py b/src/transformers/models/mixtral/configuration_mixtral.py index 7cf6afc1d342..adc86a035bed 100644 --- a/src/transformers/models/mixtral/configuration_mixtral.py +++ b/src/transformers/models/mixtral/configuration_mixtral.py @@ -115,16 +115,14 @@ class MixtralConfig(PreTrainedConfig): model_type = "mixtral" keys_to_ignore_at_inference = ["past_key_values"] base_model_tp_plan = { - "layers.*.self_attn.q_proj": "local_colwise", - "layers.*.self_attn.k_proj": "local_colwise", - "layers.*.self_attn.v_proj": "local_colwise", - "layers.*.self_attn.o_proj": "local_rowwise", - "layers.*.self_attn": "gather", + "layers.*.self_attn.q_proj": "colwise", + "layers.*.self_attn.k_proj": "colwise", + "layers.*.self_attn.v_proj": "colwise", + "layers.*.self_attn.o_proj": "rowwise", "layers.*.mlp.gate": "ep_router", # we need to replicate here 
diff --git a/src/transformers/models/mixtral/configuration_mixtral.py b/src/transformers/models/mixtral/configuration_mixtral.py
index 7cf6afc1d342..adc86a035bed 100644
--- a/src/transformers/models/mixtral/configuration_mixtral.py
+++ b/src/transformers/models/mixtral/configuration_mixtral.py
@@ -115,16 +115,14 @@ class MixtralConfig(PreTrainedConfig):
     model_type = "mixtral"
     keys_to_ignore_at_inference = ["past_key_values"]
     base_model_tp_plan = {
-        "layers.*.self_attn.q_proj": "local_colwise",
-        "layers.*.self_attn.k_proj": "local_colwise",
-        "layers.*.self_attn.v_proj": "local_colwise",
-        "layers.*.self_attn.o_proj": "local_rowwise",
-        "layers.*.self_attn": "gather",
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
         "layers.*.mlp.gate": "ep_router",  # we need to replicate here to correctly route experts
         "layers.*.mlp.experts.gate_up_proj": "local_colwise",
         "layers.*.mlp.experts.down_proj": "local_rowwise",
         "layers.*.mlp.experts": "gather",
-        # "layers.*.mlp.experts.gate_up_proj": "local_packed_rowwise" ? if you load from
     }
     base_model_pp_plan = {
         "embed_tokens": (["input_ids"], ["inputs_embeds"]),
diff --git a/src/transformers/models/qwen3_moe/configuration_qwen3_moe.py b/src/transformers/models/qwen3_moe/configuration_qwen3_moe.py
index 5043a3f38a07..8bc756a17267 100644
--- a/src/transformers/models/qwen3_moe/configuration_qwen3_moe.py
+++ b/src/transformers/models/qwen3_moe/configuration_qwen3_moe.py
@@ -121,9 +121,9 @@ class Qwen3MoeConfig(PreTrainedConfig):
         "layers.*.self_attn.k_proj": "colwise",
         "layers.*.self_attn.v_proj": "colwise",
         "layers.*.self_attn.o_proj": "rowwise",
-        "layers.*.mlp.experts.*.gate_proj": "colwise",
-        "layers.*.mlp.experts.*.up_proj": "colwise",
-        "layers.*.mlp.experts.*.down_proj": "rowwise",
+        "layers.*.mlp.experts.gate_up_proj": "local_rowwise",
+        "layers.*.mlp.experts.down_proj": "local_rowwise",
+        "layers.*.mlp.experts": "gather",
         "layers.*.mlp.gate_proj": "colwise",
         "layers.*.mlp.up_proj": "colwise",
         "layers.*.mlp.down_proj": "rowwise",
diff --git a/src/transformers/models/qwen3_next/configuration_qwen3_next.py b/src/transformers/models/qwen3_next/configuration_qwen3_next.py
index 17ce194bff84..83eb062cb6f8 100644
--- a/src/transformers/models/qwen3_next/configuration_qwen3_next.py
+++ b/src/transformers/models/qwen3_next/configuration_qwen3_next.py
@@ -135,9 +135,9 @@ class Qwen3NextConfig(PreTrainedConfig):
         "layers.*.self_attn.k_proj": "colwise",
         "layers.*.self_attn.v_proj": "colwise",
         "layers.*.self_attn.o_proj": "rowwise",
-        "layers.*.mlp.experts.*.gate_proj": "colwise",
-        "layers.*.mlp.experts.*.up_proj": "colwise",
-        "layers.*.mlp.experts.*.down_proj": "rowwise",
+        "layers.*.mlp.experts.gate_up_proj": "local_rowwise",
+        "layers.*.mlp.experts.down_proj": "local_rowwise",
+        "layers.*.mlp.experts": "gather",
         "layers.*.mlp.shared_expert.gate_proj": "colwise",
         "layers.*.mlp.shared_expert.up_proj": "colwise",
         "layers.*.mlp.shared_expert.down_proj": "rowwise",
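Since these plans are plain class attributes on the configs, the new keys can be sanity-checked without any GPUs once this change is installed; a quick check, using a checkpoint id that is only a placeholder:

```python
from transformers import AutoConfig

# Placeholder checkpoint id; any model touched by this diff works the same way.
config = AutoConfig.from_pretrained("Qwen/Qwen3-Next-80B-A3B-Instruct")
plan = config.base_model_tp_plan

# The per-expert ".*." keys are gone; the fused-parameter keys and the final gather remain.
assert "layers.*.mlp.experts.gate_up_proj" in plan
assert plan["layers.*.mlp.experts"] == "gather"
print({k: v for k, v in plan.items() if "experts" in k})
```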
"layers.*.mlp.experts": "gather", "layers.*.mlp.gate_proj": "colwise", "layers.*.mlp.up_proj": "colwise", "layers.*.mlp.down_proj": "rowwise",