Commit 5ecc871

shuhuayu and Achazwl authored
Fix bugs in initial_load_in_hf when enable_weight_tying=true in Qwen3 (#1999)
Rebased on main to merge PR #1964.

Co-authored-by: William <acha131441373@gmail.com>
Co-authored-by: Achazwl <323163497@qq.com>
1 parent: b6b2c2d

File tree

1 file changed (+9, −0)

torchtitan/models/qwen3/model/state_dict_adapter.py

Lines changed: 9 additions & 0 deletions
@@ -104,6 +104,8 @@ def to_hf(self, state_dict: dict[str, Any]) -> dict[str, Any]:
 else:
     if key not in to_hf_map:
         continue
+    if self.model_args.enable_weight_tying and key == "output.weight":
+        continue
     new_key = to_hf_map[key]
     hf_state_dict[new_key] = value

@@ -118,6 +120,13 @@ def from_hf(self, hf_state_dict: dict[str, Any]) -> dict[str, Any]:
 state_dict = {}
 expert_weights_by_layer = {}  # {layer: {abstract_key: {expert_id: tensor}}}

+if (
+    self.model_args.enable_weight_tying
+    and "lm_head.weight" not in hf_state_dict
+):
+    assert "model.embed_tokens.weight" in hf_state_dict
+    hf_state_dict["lm_head.weight"] = hf_state_dict["model.embed_tokens.weight"]
+
 for key, value in hf_state_dict.items():
     if "mlp.experts" in key:
         abstract_key = re.sub(r"(\d+)", "{}", key, count=2)

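For context, here is a minimal sketch of why both hunks are needed. It is not from the repo: TinyLM and its attribute names are hypothetical stand-ins for the Qwen3 model. With enable_weight_tying, the output projection shares one tensor with the token embedding, so the weight appears under two keys in the torchtitan state dict but only once (as model.embed_tokens.weight) in a tied HF checkpoint. Export must skip the redundant key; import must reconstruct it.

import torch.nn as nn

class TinyLM(nn.Module):
    """Toy tied-weight LM; stands in for the Qwen3 model here."""

    def __init__(self, vocab_size: int = 8, dim: int = 4):
        super().__init__()
        self.tok_embeddings = nn.Embedding(vocab_size, dim)
        self.output = nn.Linear(dim, vocab_size, bias=False)
        # Weight tying: both modules point at the same Parameter, so a
        # checkpoint only needs to persist one of the two keys.
        self.output.weight = self.tok_embeddings.weight

sd = TinyLM().state_dict()
# Both keys exist in the state dict and share the same storage.
assert sd["output.weight"].data_ptr() == sd["tok_embeddings.weight"].data_ptr()

# to_hf direction (mirrors the first hunk): drop the redundant tied key on
# export so it is not written twice. The real adapter also renames keys via
# to_hf_map, which this sketch omits.
hf_sd = {k: v for k, v in sd.items() if k != "output.weight"}

# from_hf direction (mirrors the second hunk): a tied HF checkpoint ships
# without lm_head.weight, so rebuild it from the embedding before the usual
# key remapping runs.
if "lm_head.weight" not in hf_sd:
    hf_sd["lm_head.weight"] = hf_sd["tok_embeddings.weight"]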