Skip to content

Commit acd9c90

Browse files
committed
Adding multi-image support for Llama4 SFT and decode.
Adding support for the SlideVQA dataset. Linting fixes; debugging multi-image SFT; more linting fixes; even more linting; pyink linting. Adding extra padding for the global tile. Adding an extra tile for the dummy image shape. Extending padding to 20 tiles. Moving the column merge into the input pipeline utils. Linting multimodal_utils.py. Fixing pytype issues.
1 parent 5b01873 commit acd9c90

File tree

6 files changed

+308
-149
lines changed

6 files changed

+308
-149
lines changed
Lines changed: 33 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -0,0 +1,33 @@
1+
# Copyright 2023-2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
base_config: "base.yml"
16+
17+
use_sft: True
18+
use_multimodal: True
19+
# For vision, the prompt contains image, we only train on completion tokens
20+
sft_train_on_completion_only: True
21+
packing: False # packing is not supported yet
22+
freeze_vision_encoder_params: True
23+
learning_rate: 2.e-5
24+
25+
# -------------- HF pipeline --------------
26+
dataset_type: hf
27+
hf_path: 'NTT-hil-insight/SlideVQA'
28+
train_split: 'train'
29+
hf_eval_split: 'val'
30+
train_data_columns: ['question', 'answer'] # the first column is prompt, second column is completion
31+
eval_data_columns: ['question', 'answer'] # the first column is prompt, second column is completion
32+
train_image_column: ['page_1', 'page_2', 'page_3', 'page_4', 'page_5', 'page_6', 'page_7', 'page_8', 'page_9', 'page_10', 'page_11', 'page_12', 'page_13', 'page_14', 'page_15', 'page_16', 'page_17', 'page_18', 'page_19', 'page_20'] # list of image columns
33+
eval_image_column: ['page_1', 'page_2', 'page_3', 'page_4', 'page_5', 'page_6', 'page_7', 'page_8', 'page_9', 'page_10', 'page_11', 'page_12', 'page_13', 'page_14', 'page_15', 'page_16', 'page_17', 'page_18', 'page_19', 'page_20'] # list of image columns

src/MaxText/decode.py

Lines changed: 5 additions & 9 deletions
Original file line number · Diff line number · Diff line change
@@ -16,7 +16,6 @@
1616

1717
import os
1818
from typing import Sequence
19-
import numpy as np
2019
import jax
2120
import jax.numpy as jnp
2221

@@ -103,10 +102,9 @@ def main(argv: Sequence[str]) -> None:
103102
if config.use_multimodal:
104103
image_path = config.image_path.split(",")
105104
images = [multimodal_utils.load_image_from_path(p) for p in image_path]
106-
processor_outputs = [multimodal_utils.pre_process_image(img, model_name=config.model_name) for img in images]
107-
image_offsets = sum(
108-
multimodal_utils.get_image_offsets(config.model_name, processor_output=po) for po in processor_outputs
109-
)
105+
processor_outputs = multimodal_utils.pre_process_image(images, model_name=config.model_name)
106+
image_offsets = multimodal_utils.get_image_offsets(config.model_name, processor_output=processor_outputs)
107+
110108
prefill_length -= image_offsets
111109
text = multimodal_utils.reformat_prompt(
112110
text, image_placeholder=config.image_placeholder, model_name=config.model_name, num_images=len(images)
@@ -150,10 +148,8 @@ def main(argv: Sequence[str]) -> None:
150148
prefill_result, first_token = engine.prefill(
151149
params=params,
152150
padded_tokens=tokens,
153-
images=np.stack([po.pixel_values for po in processor_outputs]) if config.use_multimodal else None,
154-
image_masks=np.stack([po.pixel_mask for po in processor_outputs])
155-
if config.use_multimodal and "llama4" in config.model_name
156-
else None,
151+
images=processor_outputs.pixel_values if config.use_multimodal else None,
152+
image_masks=processor_outputs.pixel_mask if config.use_multimodal and "llama4" in config.model_name else None,
157153
true_length=true_length,
158154
rng=rng_prefill,
159155
slot=i,

src/MaxText/input_pipeline/_hf_data_processing.py

Lines changed: 13 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -47,6 +47,17 @@ def vision_sft_preprocessing_pipeline(
4747
if config.enable_data_shuffling:
4848
dataset = dataset.shuffle(seed=config.data_shuffle_seed)
4949

50+
# If multiple image columns are provided, merge them into a single 'images' column.
51+
if isinstance(image_column, list):
52+
dataset = dataset.map(
53+
_input_pipeline_utils.merge_image_columns,
54+
fn_kwargs={
55+
"image_columns": image_column,
56+
"max_num_images_per_example": config.max_num_images_per_example,
57+
},
58+
remove_columns=image_column, # Drop the original image columns
59+
)
60+
5061
dataset = dataset.select_columns(text_columns + [image_column])
5162
if image_column != "images":
5263
dataset = dataset.rename_column(image_column, "images")
@@ -125,7 +136,9 @@ def vision_sft_preprocessing_pipeline(
125136
max_num_images_per_example=config.max_num_images_per_example,
126137
)
127138
)
139+
operations.append(_input_pipeline_utils.ExtractImagesAndMasks())
128140
operations.append(grain.Batch(batch_size=batch_size, drop_remainder=True))
141+
operations.append(_input_pipeline_utils.FoldImagesIntoBatch(model_name=config.model_name))
129142
operations.append(_input_pipeline_utils.ShiftData(ignored_ids=[pad_id], axis=1))
130143
dummy_index_sampler = grain.IndexSampler(
131144
num_records=len(dataset),

0 commit comments

Comments (0)