Add Preprocessing and token placeholders

Eitan Porat · Eitan Porat · commit b816da675cb3 · 2025-11-20T19:50:29.000Z
diff --git a/src/MaxText/configs/base.yml b/src/MaxText/configs/base.yml
@@ -892,7 +892,10 @@ dtype_mm: "float32"  # Data type for multimodal model's vision encoder
 remat_policy_for_vit: "minimal"  # Remat policy for multimodal model's vision encoder. Check `remat_policy` for options.
 image_size_for_vit: 896 # Default for Gemma3, and should be overwritten by model's config
 image_path: "" # Local image path used for decoding, can be multiple paths separated by comma, exp "/path/image1.jpg,/path/image2.jpg"
+audio_path: "" # Local audio path(s) used for decoding; multiple paths may be given separated by commas, e.g. "/path/audio1.wav,/path/audio2.wav"
+video_path: "" # Local video path(s) used for decoding; multiple paths may be given separated by commas, e.g. "/path/video1.mp4,/path/video2.mp4"
 image_placeholder: "<|image|>"
+audio_placeholder: "<|audio|>"
 posemb_type_for_vit: "learn"
 # max_num_images_per_example only applies for training when your image column is a list of images.
 # -1 means no limit, and will pad to the max possible number of images determined by sequence length.
diff --git a/src/MaxText/decode.py b/src/MaxText/decode.py
@@ -99,15 +99,17 @@ def main(argv: Sequence[str]) -> None:
   text = config.prompt
   prefill_length = config.max_prefill_predict_length
   processor_outputs = multimodal_utils.PreprocessorOutput()
+
   if config.use_multimodal:
-    image_path = config.image_path.split(",")
-    images = [multimodal_utils.load_image_from_path(p) for p in image_path]
-    processor_outputs = multimodal_utils.pre_process_image(images, model_name=config.model_name)
+    processor_outputs = multimodal_utils.preprocess_mm_data(config)
     image_offsets = multimodal_utils.get_image_offsets(config.model_name, processor_output=processor_outputs)
 
     prefill_length -= image_offsets
     text = multimodal_utils.reformat_prompt(
-        text, image_placeholder=config.image_placeholder, model_name=config.model_name, num_images=len(images)
+        text,
+        image_placeholder=config.image_placeholder,
+        model_name=config.model_name,
+        num_images=processor_outputs.num_images,
     )
 
   metadata = engine.get_tokenizer()
diff --git a/src/MaxText/multimodal_utils.py b/src/MaxText/multimodal_utils.py