@@ -139,16 +139,12 @@ def process_conversation_inputs(
139139 input_conversation : List [Dict [str , Any ]],
140140):
141141 """
142- Process input conversation for multimodal models.
143-
144- This function handles the preprocessing of conversation inputs, with special handling for
145- GraniteSpeechProcessor which requires extracting and processing audio content from conversations
146- prior to feeding into the processor.
142+ Process an input conversation into tensor inputs for multimodal models.
147143
148144 Args:
149145 processor: The processor to use for input processing
150146 tokenizer: The tokenizer to use for text processing
151- input_conversation: List of conversation messages, may contain audio content
147+ input_conversation: List of conversation messages
152148
153149 Returns:
154150 Processed inputs ready for model consumption
@@ -190,6 +186,34 @@ def process_conversation_inputs(
190186 # Generate text prompt and process with audio
191187 prompt = tokenizer .apply_chat_template (conversation , tokenize = False , add_generation_prompt = True )
192188 inputs = processor (prompt , wav , return_tensors = "pt" )
189+ elif isinstance (processor , transformers .SmolVLMProcessor ):
190+ from transformers .image_utils import load_image
191+
192+ conversation = copy .deepcopy (input_conversation )
193+ images = []
194+
195+ # Extract image URLs from conversation
196+ for message in conversation :
197+ if isinstance (message .get ("content" ), list ):
198+ # Collect the URLs of the image entries in this message
199+ image_urls = [item ["url" ] for item in message ["content" ] if item .get ("type" ) == "image" ]
200+ images .extend ([load_image (url ) for url in image_urls ])
201+
202+ # Remove image entries from content
203+ message ["content" ] = [item for item in message ["content" ] if item .get ("type" ) != "image" ]
204+
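205+ # Apply chat template. NOTE(review): tokenize=True/return_dict=True request tokenized tensors, not a text prompt, yet the result is passed as text= below - confirm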
206+ prompt = apply_chat_template_with_fallback (
207+ processor ,
208+ conversation ,
209+ add_generation_prompt = True ,
210+ tokenize = True ,
211+ return_dict = True ,
212+ return_tensors = "pt" ,
213+ )
214+
215+ # Process with text and images
216+ inputs = processor (text = prompt , images = images , return_tensors = "pt" )
193217 else :
194218 # Standard processing for other processors
195219 inputs = apply_chat_template_with_fallback (
0 commit comments