Manual progress

jackzhxng · jackzhxng · commit 4b7520fa4f2f · 2025-10-08T11:43:22.000-07:00
diff --git a/optimum/exporters/executorch/integrations.py b/optimum/exporters/executorch/integrations.py
@@ -22,6 +22,7 @@
 from transformers import (
     AutoConfig,
     AutoProcessor,
+    AutoTokenizer,
     PreTrainedModel,
     StaticCache,
     T5ForConditionalGeneration,
@@ -34,7 +35,7 @@
 
 from optimum.executorch.attentions.custom_sdpa import get_custom_sdpa_for_ring_kv_cache
 
-from .utils import apply_chat_template_with_fallback, save_config_to_constant_methods
+from .utils import apply_chat_template_with_fallback, process_conversation_inputs, save_config_to_constant_methods
 
 
 class VisionExportableModule(torch.nn.Module):
@@ -46,6 +47,7 @@ def prepare_export_inputs(self):
         # 1. Get export inputs
         model_id = self.model.config.name_or_path
         processor = AutoProcessor.from_pretrained(model_id)
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
         sample_conversation_with_image = [
             {
                 "role": "user",
@@ -54,13 +56,18 @@ def prepare_export_inputs(self):
                 ],
             },
         ]
-        processed_inputs = processor.apply_chat_template(
+        processed_inputs = process_conversation_inputs(
+            processor,
+            tokenizer,
             sample_conversation_with_image,
-            add_generation_prompt=True,
-            tokenize=True,
-            return_dict=True,
-            return_tensors="pt",
         )
+        # processed_inputs = processor.apply_chat_template(
+        #     sample_conversation_with_image,
+        #     add_generation_prompt=True,
+        #     tokenize=True,
+        #     return_dict=True,
+        #     return_tensors="pt",
+        # )
         if "pixel_values" not in processed_inputs:
             raise ValueError(
                 f"Unable to obtain sample audio encoder inputs for export for {model_id} - the processor did not return formatted inputs with the 'pixel_values' key: {processed_inputs}"
diff --git a/optimum/exporters/executorch/tasks/multimodal_text_to_text.py b/optimum/exporters/executorch/tasks/multimodal_text_to_text.py
@@ -180,8 +180,19 @@ def load_multimodal_text_to_text_model(model_name_or_path: str, **kwargs):
             "device": device,
         },
     )
-    decoder_name, audio_encoder_name, vision_encoder_name = _validate_multimodal_components(eager_model)
-    encoder_name = audio_encoder_name if audio_encoder_name else vision_encoder_name
+
+    # Most `<Model>ForConditionalGeneration` classes expose the text model and encoder models as direct
+    # attributes. However, some instead hold the base `<Model>` (the version not for conditional generation)
+    # as `self.model`, and that `self.model` is what contains the text model and encoder model attributes.
+    if hasattr(eager_model, "model"):
+        decoder_name, audio_encoder_name, vision_encoder_name = _validate_multimodal_components(eager_model.model)
+        # Set these as top level attributes.
+        setattr(eager_model, decoder_name, getattr(eager_model.model, decoder_name))
+        encoder_name = audio_encoder_name if audio_encoder_name else vision_encoder_name
+        setattr(eager_model, encoder_name, getattr(eager_model.model, encoder_name))
+    else:
+        decoder_name, audio_encoder_name, vision_encoder_name = _validate_multimodal_components(eager_model)
+        encoder_name = audio_encoder_name if audio_encoder_name else vision_encoder_name
 
     # Need to do this since apparently when nested modules (e.g. model.language_model) access the .property
     # config, it always comes from the generation_config.json file, not the `generation_config` override
diff --git a/optimum/exporters/executorch/utils.py b/optimum/exporters/executorch/utils.py
@@ -139,16 +139,12 @@ def process_conversation_inputs(
     input_conversation: List[Dict[str, Any]],
 ):
     """
-    Process input conversation for multimodal models.
-
-    This function handles the preprocessing of conversation inputs, with special handling for
-    GraniteSpeechProcessor which requires extracting and processing audio content from conversations
-    prior to feeding into the processor.
+    Process an input conversation into tensor inputs for multimodal models.
 
     Args:
         processor: The processor to use for input processing
         tokenizer: The tokenizer to use for text processing
-        input_conversation: List of conversation messages, may contain audio content
+        input_conversation: List of conversation messages
 
     Returns:
         Processed inputs ready for model consumption
@@ -190,6 +186,34 @@ def process_conversation_inputs(
         # Generate text prompt and process with audio
         prompt = tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
         inputs = processor(prompt, wav, return_tensors="pt")
+    elif isinstance(processor, transformers.SmolVLMProcessor):
+        from transformers.image_utils import load_image
+
+        conversation = copy.deepcopy(input_conversation)
+        images = []
+
+        # Extract image URLs from conversation
+        for message in conversation:
+            if isinstance(message.get("content"), list):
+                # Collect the URLs of the image entries in this message
+                image_urls = [item["url"] for item in message["content"] if item.get("type") == "image"]
+                images.extend([load_image(url) for url in image_urls])
+
+                # Remove image entries from content
+                message["content"] = [item for item in message["content"] if item.get("type") != "image"]
+
+        # Apply chat template to get text prompt
+        prompt = apply_chat_template_with_fallback(
+            processor,
+            conversation,
+            add_generation_prompt=True,
+            tokenize=True,
+            return_dict=True,
+            return_tensors="pt",
+        )
+
+        # Process with text and images
+        inputs = processor(text=prompt, images=images, return_tensors="pt")
     else:
         # Standard processing for other processors
         inputs = apply_chat_template_with_fallback(