@@ -81,6 +81,13 @@ public bool IsMultiModal
8181 /// <inheritdoc />
8282 public List < SafeMtmdEmbed > Embeds { get ; }
8383
/// <summary>
/// Pending multimodal chunks produced by the MTMD tokenizer.
/// Held between tokenization and evaluation; owned by this executor and
/// released via DisposeMtmdChunks().
/// </summary>
protected SafeMtmdInputChunks? MtmdChunks { get; set; }

// Cached media-marker string; resolved lazily on first call to GetMtmdMarker().
private string? _mtmdMarker;

8491 private readonly StreamingTokenDecoder _decoder ;
8592
8693 /// <summary>
@@ -235,6 +242,194 @@ protected virtual void TryReuseMatchingPrefix()
235242 }
236243 }
237244
/// <summary>
/// Dispose and clear any queued multimodal chunk collection.
/// Safe to call when nothing is queued.
/// </summary>
protected void DisposeMtmdChunks()
{
    // Detach first so the field is never left pointing at a disposed object.
    var pending = MtmdChunks;
    MtmdChunks = null;
    pending?.Dispose();
}
253+
/// <summary>
/// Dispose and clear any pending multimodal embeddings.
/// No-op when the embed list is already empty.
/// </summary>
protected void DisposeEmbeds()
{
    if (Embeds.Count > 0)
    {
        Embeds.ForEach(embed => embed.Dispose());
        Embeds.Clear();
    }
}
267+
/// <summary>
/// Retrieve the marker token used to signal media segments to the tokenizer.
/// Resolved once from the native library and cached; falls back to the
/// conventional "&lt;media&gt;" literal when the native call yields nothing.
/// </summary>
protected string GetMtmdMarker()
{
    return _mtmdMarker ??= NativeApi.MtmdDefaultMarker() ?? "<media>";
}
279+
/// <summary>
/// Ensure the token list fills all positional slots reported by the MTMD helper.
/// Always returns a fresh list; the input list is never mutated.
/// </summary>
protected static List<LLamaToken> BuildTokensWithFiller(List<LLamaToken> tokens, int totalPositions, LLamaToken fillerToken)
{
    var padded = new List<LLamaToken>(Math.Max(totalPositions, tokens.Count));
    padded.AddRange(tokens);

    // Top up with filler until every reported position is occupied.
    while (padded.Count < totalPositions)
        padded.Add(fillerToken);

    return padded;
}
293+
/// <summary>
/// Resolve the fallback token inserted when the tokenizer emits fewer tokens than positions.
/// Prefers the final token of the tokenized marker, then the vocabulary EOS,
/// and finally the default token value.
/// </summary>
protected LLamaToken GetFillerToken(string marker)
{
    var markerTokens = Context.Tokenize(marker, false, true);

    return markerTokens.Length > 0
        ? markerTokens[^1]
        : Context.Vocab.EOS ?? default;
}
309+
/// <summary>
/// Prepare multimodal inputs by invoking the MTMD tokenizer and aligning filler tokens.
/// </summary>
/// <param name="text">Prompt text; legacy "&lt;image&gt;" placeholders are rewritten to the MTMD marker.</param>
/// <param name="args">Inference state; RemainedTokens is decremented in append mode.</param>
/// <param name="addBos">Whether the tokenizer should prepend a BOS token.</param>
/// <param name="replaceExisting">True to replace the current input tokens, false to append.</param>
/// <exception cref="InvalidOperationException">No clip model is loaded.</exception>
/// <exception cref="RuntimeError">The MTMD tokenizer reported a failure.</exception>
protected Task PreprocessMtmd(string text, InferStateArgs args, bool addBos, bool replaceExisting)
{
    if (ClipModel is null)
        throw new InvalidOperationException("Multimodal execution requires a loaded mtmd clip model.");

    DisposeMtmdChunks();

    var marker = GetMtmdMarker();
    var prompt = text;

    if (Embeds.Count > 0)
    {
        // Normalize legacy "<image>" placeholders to the marker the tokenizer expects.
        // Replace is a no-op when the placeholder is absent, so no Contains guard is needed.
        prompt = prompt.Replace("<image>", marker);

        // If the caller supplied no markers at all, append one per pending embed
        // so every embed gets a media slot in the tokenized output.
        if (!prompt.Contains(marker))
            prompt = string.Concat(prompt, string.Concat(Enumerable.Repeat(marker, Embeds.Count)));
    }

    SafeMtmdInputChunks? chunks = null;
    try
    {
        var status = ClipModel.Tokenize(prompt, addBos, parseSpecial: true, out chunks);
        if (status != 0 || chunks is null)
        {
            // Drop any media the model already staged before surfacing the failure.
            ClipModel.ClearMedia();
            throw new RuntimeError($"Failed to tokenize multimodal prompt. Status: {status}.");
        }

        MtmdChunks = chunks;

        // Collect plain text tokens; media chunks contribute positions only.
        var tokens = new List<LLamaToken>();
        foreach (var chunk in chunks.Enumerate())
        {
            using var scopedChunk = chunk;
            if (scopedChunk.Type != SafeMtmdInputChunk.SafeMtmdInputChunkType.Text)
                continue;

            foreach (var token in scopedChunk.GetTextTokensSpan())
                tokens.Add(unchecked((int)token));
        }

        var totalPositions = (int)ClipModel.CountPositions(chunks);
        var fillerToken = GetFillerToken(marker);

        if (replaceExisting)
        {
            _embed_inps = BuildTokensWithFiller(tokens, totalPositions, fillerToken);
            _consumedTokensCount = 0;
        }
        else
        {
            _embed_inps.AddRange(tokens);

            // Pad so every position reported by the MTMD helper is occupied.
            var fillerCount = totalPositions - tokens.Count;
            if (fillerCount > 0)
                _embed_inps.AddRange(Enumerable.Repeat(fillerToken, fillerCount));

            args.RemainedTokens -= tokens.Count;
        }
    }
    catch
    {
        // Never leave a partially-built chunk collection queued after a failure.
        chunks?.Dispose();
        MtmdChunks = null;
        throw;
    }
    finally
    {
        // Embeds are consumed by tokenization, success or failure — release them.
        DisposeEmbeds();
    }

    return Task.CompletedTask;
}
392+
/// <summary>
/// Apply bookkeeping after successfully evaluating multimodal chunks:
/// advance the past-token counter, release the queued chunks, append any
/// newly consumed tokens to the session log, and mark all input consumed.
/// </summary>
protected void FinalizeMtmdEvaluation(long newNPast, int previousConsumed)
{
    _pastTokensCount = checked((int)newNPast);
    DisposeMtmdChunks();

    var trackingSession = !string.IsNullOrEmpty(_pathSession);
    if (trackingSession && _embed_inps.Count > previousConsumed)
    {
        // Record only the tokens evaluated in this pass.
        for (var i = previousConsumed; i < _embed_inps.Count; i++)
            _session_tokens.Add(_embed_inps[i]);
        _n_session_consumed = _session_tokens.Count;
    }

    _consumedTokensCount = _embed_inps.Count;
    _embeds.Clear();
}
410+
/// <summary>
/// Evaluate the queued MTMD chunks and update executor state.
/// </summary>
/// <exception cref="InvalidOperationException">No clip model is loaded, or no chunks are queued.</exception>
/// <exception cref="RuntimeError">The native evaluation reported a non-zero status.</exception>
protected void EvaluateMtmdChunks(ref long nPast, int previousConsumed, string executorName)
{
    if (ClipModel is null)
        throw new InvalidOperationException("Multimodal execution requires a loaded mtmd clip model.");
    if (MtmdChunks is null)
        throw new InvalidOperationException("No MTMD chunks are queued for evaluation.");

    var batchSize = checked((int)Context.BatchSize);
    var evalStatus = ClipModel.EvaluateChunks(MtmdChunks, Context.NativeHandle, ref nPast, seqId: 0,
        nBatch: batchSize, logitsLast: true);

    if (evalStatus != 0)
    {
        _logger?.LogError("[{Executor}] Failed to evaluate multimodal chunks. Status: {Status}", executorName, evalStatus);
        DisposeMtmdChunks();
        throw new RuntimeError($"Failed to evaluate multimodal chunks. Status: {evalStatus}.");
    }

    FinalizeMtmdEvaluation(nPast, previousConsumed);
}
432+
238433 /// <summary>
239434 /// Determine whether the inference loop should continue processing tokens.
240435 /// </summary>
0 commit comments