Merge pull request #2604 from AI-Hypercomputer:rbierneni-qwen3-next-fullattention

Google-ML-Automation · Google-ML-Automation · commit b8fb6686bfea · 2025-11-06T08:45:28.000-08:00
PiperOrigin-RevId: 828972171
diff --git a/src/MaxText/layers/attentions.py b/src/MaxText/layers/attentions.py
@@ -481,9 +481,6 @@ def __init__(
     else:
       self.sinks = None
 
-    self.query_norm = None
-    self.key_norm = None
-
     is_llama4_decoder_block = self.config.decoder_block == DecoderBlockType.LLAMA4
     if self.use_qk_norm and not is_llama4_decoder_block:
       self.query_norm = RMSNorm(
@@ -519,6 +516,9 @@ def __init__(
           weight_dtype=self.config.weight_dtype,
           rngs=self.rngs,
       )
+    else:
+      self.query_norm = None
+      self.key_norm = None
 
     self._maybe_shard_with_logical = functools.partial(
         maybe_shard_with_logical,
diff --git a/tests/train_compile_test.py b/tests/train_compile_test.py
@@ -701,3 +701,19 @@ def test_gpt3_6b(self):
             "per_device_batch_size=1",
         )
     )
+
+  @pytest.mark.cpu_only
+  def test_qwen3_qk_norm(self):
+    """AOT compile test for a non-Llama4 QK-norm model (qwen3-0.6b)."""
+    compiled_trainstep_file = "/tmp/test_qwen3_qk_norm"
+    train_compile_main(
+        (
+            "",
+            os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml"),
+            f"compiled_trainstep_file={compiled_trainstep_file}",
+            "compile_topology=v5p-8",
+            "compile_topology_num_slices=1",
+            "model_name=qwen3-0.6b",
+            "per_device_batch_size=1",
+        )
+    )