
Commit 660a018
Parent: 0b24223

pre
pylint video image/video/audio pyink

File tree: 11 files changed (+1133, -6 lines)

src/MaxText/configs/base.yml (3 additions, 0 deletions)

@@ -873,6 +873,9 @@ remat_policy_for_vit: "minimal" # Remat policy for multimodal model's vision en
 image_size_for_vit: 896 # Default for Gemma3, and should be overwritten by model's config
 image_path: "" # Local image path used for decoding, can be multiple paths separated by comma, exp "/path/image1.jpg,/path/image2.jpg"
 image_placeholder: "<|image|>"
+video_path: "" # Local video path used for decoding, can be multiple paths separated by comma, exp "/path/video1.mp4,/path/video2.mp4"
+audio_path: "" # Local audio path used for decoding, can be multiple paths separated by comma, exp "/path/audio1.wav,/path/audio2.wav"
+use_audio_in_video: False
 posemb_type_for_vit: "learn"
 # max_num_images_per_example only applies for training when your image column is a list of images.
 # -1 means no limit, and will pad to the max possible number of images determined by sequence length.
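The new keys extend decoding inputs beyond images: video_path and audio_path follow the existing image_path convention (a comma-separated list of local file paths, empty by default), and use_audio_in_video presumably toggles whether the audio track of supplied videos is used as well. A tiny hypothetical helper (not part of MaxText) showing how such a value splits into individual paths:

```python
# Hypothetical helper, not in this commit: split a comma-separated media-path
# config value (image_path / video_path / audio_path) into individual paths,
# mirroring how decode.py previously split image_path.
def split_media_paths(path_config: str) -> list[str]:
  return [p.strip() for p in path_config.split(",") if p.strip()]


assert split_media_paths("/path/video1.mp4,/path/video2.mp4") == [
    "/path/video1.mp4",
    "/path/video2.mp4",
]
assert split_media_paths("") == []  # the empty default means no media
```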

src/MaxText/configs/models/qwen3-omni-30b-a3b.yml (1 addition, 1 deletion)

@@ -20,7 +20,7 @@ base_emb_dim: 2048
 base_mlp_dim: 768
 base_num_query_heads: 32
 base_num_kv_heads: 4
-base_num_decoder_layers: 48
+base_num_decoder_layers: 1
 head_dim: 128
 mlp_activations: ["silu", "linear"]
 vocab_size: 152064

src/MaxText/decode.py (6 additions, 4 deletions)

@@ -28,6 +28,7 @@
 from MaxText import pyconfig
 from MaxText import profiler
 from MaxText import multimodal_utils
+from MaxText.multimodal import preprocessor
 # Placeholder: internal

@@ -100,14 +101,15 @@ def main(argv: Sequence[str]) -> None:
   prefill_length = config.max_prefill_predict_length
   processor_outputs = multimodal_utils.PreprocessorOutput()
   if config.use_multimodal:
-    image_path = config.image_path.split(",")
-    images = [multimodal_utils.load_image_from_path(p) for p in image_path]
-    processor_outputs = multimodal_utils.pre_process_image(images, model_name=config.model_name)
+    processor_outputs = preprocessor.preprocess_mm_data(config)
     image_offsets = multimodal_utils.get_image_offsets(config.model_name, processor_output=processor_outputs)

     prefill_length -= image_offsets
     text = multimodal_utils.reformat_prompt(
-        text, image_placeholder=config.image_placeholder, model_name=config.model_name, num_images=len(images)
+        text,
+        image_placeholder=config.image_placeholder,
+        model_name=config.model_name,
+        num_images=processor_outputs.num_images,
     )

   metadata = engine.get_tokenizer()
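This refactor replaces decode.py's inline, image-only loading (split image_path, load each image, call pre_process_image) with a single dispatch into the new preprocessor module, and takes the image count from the preprocessor output instead of a local list. A minimal sketch of the resulting call site, assuming a pyconfig-style config with the attributes used above; the wrapper function itself is illustrative, not part of the commit:

```python
# Sketch of the refactored multimodal prefill preparation in decode.py.
# Every call below appears verbatim in the diff; only the wrapper function
# and its signature are illustrative.
from MaxText import multimodal_utils
from MaxText.multimodal import preprocessor


def prepare_multimodal_inputs(config, text, prefill_length):
  """Preprocess media and rewrite the prompt before prefill."""
  processor_outputs = multimodal_utils.PreprocessorOutput()
  if config.use_multimodal:
    # One entry point for image/video/audio data, routed by model name.
    processor_outputs = preprocessor.preprocess_mm_data(config)
    # Media tokens consume part of the prefill budget.
    image_offsets = multimodal_utils.get_image_offsets(config.model_name, processor_output=processor_outputs)
    prefill_length -= image_offsets
    text = multimodal_utils.reformat_prompt(
        text,
        image_placeholder=config.image_placeholder,
        model_name=config.model_name,
        num_images=processor_outputs.num_images,
    )
  return text, prefill_length, processor_outputs
```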

src/MaxText/layers/decoders.py (2 additions, 0 deletions)

@@ -566,6 +566,8 @@ def _apply_embedding(
           image_masks=image_masks,
       )
     # TODO(hengtaoguo): Add support for other multimodal models such as Llama4, refactor if needed
+    elif cfg.model_name in ["qwen3-omni-30b-a3b"]:
+      pass
     else:
       raise ValueError(f"Unsupported model_name for multimodal: {cfg.model_name}")

src/MaxText/layers/encoders.py (4 additions, 0 deletions)

@@ -44,6 +44,10 @@ def get_vision_encoder_layers(self):
       from MaxText.layers import llama4  # pylint: disable=import-outside-toplevel

       return [llama4.llama4visionmodel_as_linen, llama4.llama4multimodalprojector_as_linen]
+    elif self.config.model_name in ["qwen3-omni-30b-a3b"]:
+      from MaxText.layers import gemma3  # pylint: disable=import-outside-toplevel
+
+      return [gemma3.gemma3visionencoder_as_linen, gemma3.visionembedder_as_linen]
     else:
       raise ValueError(f"No VisionEncoder implemented for {self.config.model_name} yet")
4953

src/MaxText/multimodal/__init__.py (13 additions, 0 deletions)

@@ -0,0 +1,13 @@
+# Copyright 2023–2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
src/MaxText/multimodal/preprocessor.py (45 additions, 0 deletions)

@@ -0,0 +1,45 @@
+# Copyright 2023–2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from MaxText import multimodal_utils  # TODO(hengtaoguo): deprecate this file and refactor to MaxText/multimodal/utils.py
+
+
+def preprocess_mm_data(config):
+  """Preprocesses multimodal data based on the provided configuration.
+  Routes to the appropriate preprocessing function based on the model name.
+
+  Args:
+    config: A `pyconfig.Config` object containing configuration parameters.
+
+  Returns:
+    A `PreprocessorOutput` object containing the processed multimodal data.
+  """
+  processor_outputs = multimodal_utils.PreprocessorOutput()
+
+  if config.model_name in ["gemma3-4b", "gemma3-12b", "gemma3-27b"]:
+
+    images = [multimodal_utils.load_image_from_path(p) for p in config.image_path.split(",")]
+    processor_outputs = multimodal_utils.pre_process_gemma3_image(images)
+  elif config.model_name in ["llama4-17b-16e", "llama4-17b-128e"]:
+
+    images = [multimodal_utils.load_image_from_path(p) for p in config.image_path.split(",")]
+    processor_outputs = multimodal_utils.pre_process_llama4_image(images)
+  elif config.model_name in ["qwen3-omni-30b-a3b"]:
+    from MaxText.multimodal.qwen3_omni_processor import preprocess_mm_data_qwen3_omni  # pylint: disable=import-outside-toplevel
+
+    processor_outputs = preprocess_mm_data_qwen3_omni(config)
+  else:
+    raise ValueError(f"Model {config.model_name} not supported for multimodal preprocessing.")
+
+  return processor_outputs
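The router keeps the existing Gemma3 and Llama4 image-only paths and adds a qwen3-omni branch that delegates to a dedicated qwen3_omni_processor, which is presumably where the new video_path/audio_path/use_audio_in_video configs get consumed. A hedged usage sketch: the stub config below only mimics the pyconfig attributes this function reads, and the placeholder paths would need to point at real files on disk.

```python
# Hedged usage sketch for MaxText.multimodal.preprocessor. FakeConfig is a
# stand-in for pyconfig.Config exposing only what preprocess_mm_data reads;
# the image paths are placeholders and must exist for the loads to succeed.
from dataclasses import dataclass

from MaxText.multimodal import preprocessor


@dataclass
class FakeConfig:
  model_name: str = "gemma3-4b"
  image_path: str = "/path/image1.jpg,/path/image2.jpg"


outputs = preprocessor.preprocess_mm_data(FakeConfig())
print(outputs.num_images)  # decode.py reads this field after preprocessing
```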
