Merge pull request #2474 from AI-Hypercomputer:mohit/tokamax-gmm

Google-ML-Automation · Google-ML-Automation · commit 6b1ef88fbea0 · 2025-10-23T09:50:10.000-07:00
PiperOrigin-RevId: 823078777
diff --git a/requirements.txt b/requirements.txt
@@ -37,6 +37,7 @@ tensorflow-datasets
 tensorflow-text
 tensorflow
 tiktoken
+tokamax>=0.0.3
 transformers
 google-jetstream @ https://github.com/AI-Hypercomputer/JetStream/archive/29329e8e73820993f77cfc8efe34eb2a73f5de98.zip
 mlperf-logging @ https://github.com/mlcommons/logging/archive/38ab22670527888c8eb7825a4ece176fcc36a95d.zip
diff --git a/requirements_with_jax_ai_image.txt b/requirements_with_jax_ai_image.txt
@@ -23,4 +23,5 @@ sentencepiece>=0.2.0
 tensorflow-datasets
 tensorflow-text>=2.17.0
 tiktoken
+tokamax>=0.0.3
 transformers
diff --git a/src/MaxText/configs/base.yml b/src/MaxText/configs/base.yml
@@ -878,3 +878,6 @@ gdn_num_value_heads: 32
 gdn_chunk_size: 64
 # Whether to apply L2 normalization to query and key tensors inside the Gated Delta Rule kernel.
 use_qk_norm_in_gdn: True
+
+# Use the tokamax library's ragged_dot kernel for the MoE gmm instead of the megablox gmm. TPU only; applies only when megablox is enabled.
+use_tokamax_gmm: false
diff --git a/src/MaxText/layers/moe.py b/src/MaxText/layers/moe.py
@@ -36,6 +36,9 @@
 from MaxText.layers import attentions, linears, quantizations, nnx_wrappers
 from MaxText.layers.initializers import NdInitializer, nd_dense_init, default_bias_init, variable_to_logically_partitioned
 
+if jax.__version_info__ >= (0, 8, 0):  # tuple compare: a string compare would rank "0.10.0" below "0.8.0"
+  from tokamax._src.ops.ragged_dot import api as tokamax_api
+
 set_xla_metadata = xla_metadata.set_xla_metadata
 
 
@@ -807,16 +810,26 @@ def gmm(inputs, kernel, tiling, group_sizes, expert_assignments):
           min(tiling[2], n),
       )
       if self.config.megablox:
-        output = mblx.gmm(
-            lhs=inputs,
-            rhs=kernel,
-            group_sizes=group_sizes,
-            preferred_element_type=self.dtype,
-            tiling=tiling,
-            lhs_quantize_dtype=lhs_quantize_dtype,
-            rhs_quantize_dtype=rhs_quantize_dtype,
-            use_qwix_quantization=self.config.use_qwix_quantization,
-        )
+        if self.config.use_tokamax_gmm:
+          output = tokamax_api.ragged_dot(  #  pylint: disable=possibly-used-before-assignment
+              lhs=inputs,
+              rhs=kernel,
+              group_sizes=group_sizes,
+              precision=jax.lax.Precision.DEFAULT,
+              preferred_element_type=self.dtype,
+              implementation="mosaic",
+          )
+        else:
+          output = mblx.gmm(
+              lhs=inputs,
+              rhs=kernel,
+              group_sizes=group_sizes,
+              preferred_element_type=self.dtype,
+              tiling=tiling,
+              lhs_quantize_dtype=lhs_quantize_dtype,
+              rhs_quantize_dtype=rhs_quantize_dtype,
+              use_qwix_quantization=self.config.use_qwix_quantization,
+          )
       else:
         rhs_inputs = kernel
         if isinstance(kernel, aqt.QTensor):
diff --git a/src/MaxText/pyconfig.py b/src/MaxText/pyconfig.py
@@ -290,6 +290,12 @@ def validate_quantization_methods(keys):
       raise ValueError(f"Invalid quantization method {keys['quantization']}. Valid options are {valid_quant_methods}")
 
 
+def validate_tokamax_usage(keys):
+  """Raise ValueError if `use_tokamax_gmm` is enabled while `hardware` is anything other than "tpu"."""
+  if keys["use_tokamax_gmm"] and keys["hardware"] != "tpu":
+    raise ValueError(f"Invalid tokamax's megablox kernel usage for hardware {keys['hardware']}. Only TPU is supported.")
+
+
 def validate_data_input(keys):
   """validate provided parameters for data input"""
   if not keys["hf_access_token"]:
@@ -737,6 +743,7 @@ def user_init(raw_keys):
     validate_data_input(raw_keys)
     validate_constant_bound(raw_keys)
     validate_quantization_methods(raw_keys)
+    validate_tokamax_usage(raw_keys)
 
     raw_keys["decoder_block"] = DecoderBlockType(raw_keys["decoder_block"])