@@ -86,6 +86,13 @@ public bool IsMultiModal
8686 /// <inheritdoc />
8787 public List < SafeMtmdEmbed > Embeds { get ; }
8888
89+ /// <summary>
90+ /// Pending multimodal chunks produced by the MTMD tokenizer.
91+ /// </summary>
92+ protected SafeMtmdInputChunks ? MtmdChunks { get ; set ; }
93+
94+ private string ? _mtmdMarker ;
95+
8996 private readonly StreamingTokenDecoder _decoder ;
9097
9198 /// <summary>
@@ -242,6 +249,194 @@ protected virtual void TryReuseMatchingPrefix()
242249 }
243250 }
244251
/// <summary>
/// Dispose and clear any queued multimodal chunk collection.
/// </summary>
protected void DisposeMtmdChunks()
{
    // Detach first so the property never references a disposed object.
    var pending = MtmdChunks;
    MtmdChunks = null;
    pending?.Dispose();
}
260+
/// <summary>
/// Dispose and clear any pending multimodal embeddings.
/// </summary>
protected void DisposeEmbeds()
{
    if (Embeds.Count > 0)
    {
        // Each embed owns native memory; release it before dropping the reference.
        foreach (var embed in Embeds)
            embed.Dispose();

        Embeds.Clear();
    }
}
274+
/// <summary>
/// Retrieve the marker token used to signal media segments to the tokenizer.
/// </summary>
/// <returns>The native MTMD default marker, or "&lt;media&gt;" when the native call yields none.</returns>
protected string GetMtmdMarker()
{
    // Resolve once and cache the result for subsequent calls.
    return _mtmdMarker ??= NativeApi.MtmdDefaultMarker() ?? "<media>";
}
286+
/// <summary>
/// Ensure the token list fills all positional slots reported by the MTMD helper.
/// </summary>
/// <param name="tokens">Tokens emitted by the tokenizer.</param>
/// <param name="totalPositions">Total number of positional slots that must be occupied.</param>
/// <param name="fillerToken">Token used to pad any unoccupied trailing slots.</param>
/// <returns>A new list: the input tokens followed by filler up to <paramref name="totalPositions"/>.</returns>
protected static List<LLamaToken> BuildTokensWithFiller(List<LLamaToken> tokens, int totalPositions, LLamaToken fillerToken)
{
    var capacity = totalPositions > tokens.Count ? totalPositions : tokens.Count;
    var padded = new List<LLamaToken>(capacity);
    padded.AddRange(tokens);

    // Pad the tail; loop body never runs when tokens already cover every position.
    for (var slot = tokens.Count; slot < totalPositions; slot++)
        padded.Add(fillerToken);

    return padded;
}
300+
/// <summary>
/// Resolve the fallback token inserted when the tokenizer emits fewer tokens than positions.
/// </summary>
/// <param name="marker">The media marker string whose tokenization seeds the fallback.</param>
/// <returns>
/// The last token of the tokenized marker; otherwise the vocabulary EOS token;
/// otherwise the default token value.
/// </returns>
protected LLamaToken GetFillerToken(string marker)
{
    // Preference order: marker's final token → EOS → default(LLamaToken).
    var markerTokens = Context.Tokenize(marker, false, true);
    if (markerTokens.Length != 0)
        return markerTokens[^1];

    return Context.Vocab.EOS ?? default;
}
316+
/// <summary>
/// Prepare multimodal inputs by invoking the MTMD tokenizer and aligning filler tokens.
/// </summary>
/// <param name="text">
/// Prompt text. A literal "&lt;image&gt;" placeholder is rewritten to the native media marker;
/// if media embeds are queued but no marker appears, one marker per embed is appended.
/// </param>
/// <param name="args">Inference state; its remaining-token budget is reduced by the tokens added (append mode only).</param>
/// <param name="addBos">Whether the tokenizer should prepend a BOS token.</param>
/// <param name="replaceExisting">
/// When true, the pending prompt tokens are replaced and the consumed counter reset;
/// when false, new tokens are appended to the existing prompt.
/// </param>
/// <returns>A completed task (the work is synchronous).</returns>
/// <exception cref="InvalidOperationException">No MTMD clip model is loaded.</exception>
/// <exception cref="RuntimeError">The native tokenizer reported a non-zero status.</exception>
protected Task PreprocessMtmd(string text, InferStateArgs args, bool addBos, bool replaceExisting)
{
    if (ClipModel is null)
        throw new InvalidOperationException("Multimodal execution requires a loaded mtmd clip model.");

    // Drop chunks left over from any previous preprocess pass before building new ones.
    DisposeMtmdChunks();

    var marker = GetMtmdMarker();
    var prompt = text;

    if (Embeds.Count > 0)
    {
        // Normalise the commonly used "<image>" placeholder to the native marker.
        if (prompt.Contains("<image>"))
            prompt = prompt.Replace("<image>", marker);

        // Media was supplied but the prompt has no marker: append one marker per embed
        // so the tokenizer knows where to splice the media chunks.
        if (!prompt.Contains(marker))
        {
            var suffix = string.Concat(Enumerable.Repeat(marker, Embeds.Count));
            prompt = string.Concat(prompt, suffix);
        }
    }

    SafeMtmdInputChunks? chunks = null;
    try
    {
        var status = ClipModel.Tokenize(prompt, addBos, parseSpecial: true, out chunks);
        if (status != 0 || chunks is null)
        {
            // Clear staged media so a retry does not see stale attachments.
            ClipModel.ClearMedia();
            throw new RuntimeError($"Failed to tokenize multimodal prompt. Status: {status}.");
        }

        MtmdChunks = chunks;

        // Collect only the text tokens; media chunks occupy positions without
        // contributing text tokens here.
        var tokens = new List<LLamaToken>();
        foreach (var chunk in chunks.Enumerate())
        {
            using var scopedChunk = chunk;
            if (scopedChunk.Type != SafeMtmdInputChunk.SafeMtmdInputChunkType.Text)
                continue;

            foreach (var token in scopedChunk.GetTextTokensSpan())
                tokens.Add(unchecked((int)token));
        }

        var totalPositions = (int)ClipModel.CountPositions(chunks);
        var fillerToken = GetFillerToken(marker);

        if (replaceExisting)
        {
            _embed_inps = BuildTokensWithFiller(tokens, totalPositions, fillerToken);
            _consumedTokensCount = 0;
        }
        else
        {
            // Append mode: extend the pending prompt, then pad so every position
            // reported by the MTMD helper is backed by a token.
            // (The original guard "if (_embed_inps.Count == 0) _embed_inps = new List<LLamaToken>();"
            // was a no-op and has been removed.)
            _embed_inps.AddRange(tokens);

            var fillerCount = totalPositions - tokens.Count;
            if (fillerCount > 0)
                _embed_inps.AddRange(Enumerable.Repeat(fillerToken, fillerCount));

            args.RemainedTokens -= tokens.Count;
        }
    }
    catch
    {
        // Do not retain partially-built chunk state after a failure.
        chunks?.Dispose();
        MtmdChunks = null;
        throw;
    }
    finally
    {
        // Embeds were handed to the tokenizer (or are unusable after a failure);
        // release them in every exit path.
        DisposeEmbeds();
    }

    return Task.CompletedTask;
}
399+
/// <summary>
/// Apply bookkeeping after successfully evaluating multimodal chunks.
/// </summary>
/// <param name="newNPast">New past-token position reported by the native evaluation; must fit in an <see cref="int"/>.</param>
/// <param name="previousConsumed">Number of prompt tokens that had been consumed before this evaluation.</param>
protected void FinalizeMtmdEvaluation ( long newNPast , int previousConsumed )
{
    // checked: an overflow here means the context position exceeded int range.
    _pastTokensCount = checked ( ( int ) newNPast ) ;
    // The queued chunks have been evaluated and are no longer needed.
    DisposeMtmdChunks ( ) ;

    // When a session file is in use, record the newly consumed prompt tokens
    // so the session state stays in sync with the context.
    if ( ! string . IsNullOrEmpty ( _pathSession ) && _embed_inps . Count > previousConsumed )
    {
        _session_tokens . AddRange ( _embed_inps . Skip ( previousConsumed ) ) ;
        _n_session_consumed = _session_tokens . Count ;
    }

    // Mark the whole prompt as consumed and drop the pending-eval buffer.
    // NOTE(review): _embeds appears to be the executor's pending token list,
    // distinct from the Embeds media collection — confirm against the base class.
    _consumedTokensCount = _embed_inps . Count ;
    _embeds . Clear ( ) ;
}
417+
/// <summary>
/// Evaluate the queued MTMD chunks and update executor state.
/// </summary>
/// <param name="nPast">Running past-token position; advanced by the native evaluation.</param>
/// <param name="previousConsumed">Number of prompt tokens consumed before this evaluation.</param>
/// <param name="executorName">Executor name used in log messages.</param>
/// <exception cref="InvalidOperationException">No clip model is loaded or no chunks are queued.</exception>
/// <exception cref="RuntimeError">The native evaluation reported a non-zero status.</exception>
protected void EvaluateMtmdChunks(ref long nPast, int previousConsumed, string executorName)
{
    if (ClipModel is null)
        throw new InvalidOperationException("Multimodal execution requires a loaded mtmd clip model.");
    if (MtmdChunks is null)
        throw new InvalidOperationException("No MTMD chunks are queued for evaluation.");

    var status = ClipModel.EvaluateChunks(MtmdChunks, Context.NativeHandle, ref nPast, seqId: 0,
        nBatch: checked((int)Context.BatchSize), logitsLast: true);

    if (status == 0)
    {
        FinalizeMtmdEvaluation(nPast, previousConsumed);
        return;
    }

    // Failure path: log, release the chunk collection, and surface the error.
    _logger?.LogError("[{Executor}] Failed to evaluate multimodal chunks. Status: {Status}", executorName, status);
    DisposeMtmdChunks();
    throw new RuntimeError($"Failed to evaluate multimodal chunks. Status: {status}.");
}
439+
245440 /// <summary>
246441 /// Determine whether the inference loop should continue processing tokens.
247442 /// </summary>
0 commit comments