WIP

SignalRT · SignalRT · commit 4e4eaf9dcc13 · 2025-09-27T18:37:50.000+02:00
diff --git a/LLama/LLamaInteractExecutor.cs b/LLama/LLamaInteractExecutor.cs
@@ -8,6 +8,7 @@
 using System.Text.Json;
 using System.Text.Json.Serialization;
 using System.Threading.Tasks;
+using LLama;
 using LLama.Exceptions;
 using LLama.Sampling;
 using Microsoft.Extensions.Logging;
@@ -21,12 +22,10 @@ namespace LLama
     public class InteractiveExecutor : StatefulExecutorBase
     {
         private bool _is_prompt_run = true;
-        
-        // LLava
-        private int _EmbedImagePosition = -1;
-        // TODO JLS:
-        //private List<SafeMtmdImageEmbedHandle> _imageEmbedHandles = new List<SafeMtmdImageEmbedHandle>();
-        private bool _imageInPrompt = false;
+
+        // MTMD multimodal state
+        private SafeMtmdInputChunks? _mtmdChunks;
+        private string? _mtmdMarker;
 
         /// <summary>
         /// 
@@ -71,6 +70,7 @@ public override ExecutorBaseState GetStateData()
         /// <inheritdoc />
         public override Task LoadState(ExecutorBaseState data)
         {
+            DisposeMtmdChunks();
             if (data is InteractiveExecutorState state)
             {
                 _n_session_consumed = state.ConsumedSessionCount;
@@ -130,7 +130,7 @@ protected override Task PreprocessInputs(string? text, InferStateArgs args)
                 }
                 else
                 {
-                    PreprocessLlava(text, args, true);
+                    PreprocessMtmd(text, args, true);
                 }
             }
             else
@@ -151,51 +151,121 @@ protected override Task PreprocessInputs(string? text, InferStateArgs args)
                     }
                     else
                     {
-                        PreprocessLlava(text, args, false);
+                        PreprocessMtmd(text, args, false);
                     }
                 }
             }
 
             return Task.CompletedTask;
         }
 
+        private void DisposeMtmdChunks() // Releases the pending multimodal chunk collection, if there is one.
+        {
+            _mtmdChunks?.Dispose(); // Null-conditional call: nothing happens when no chunks are pending.
+            _mtmdChunks = null; // InferInternal checks this field for null to decide whether chunks still need evaluating.
+        }
+
+        /// <summary>
+        /// Disposes every queued embed and then empties <c>Embeds</c>; does nothing when it is already empty.
+        /// </summary>
+        private void DisposeEmbeds()
+        {
+            if (Embeds.Count > 0)
+            {
+                foreach (var pending in Embeds)
+                {
+                    pending.Dispose();
+                }
+                Embeds.Clear();
+            }
+        }
+
+        /// <summary>
+        /// Returns the media marker for multimodal prompts, resolving and caching it on the first call.
+        /// </summary>
+        /// <returns>The cached marker; <c>&lt;media&gt;</c> when the native default is <c>null</c>.</returns>
+        private string GetMtmdMarker()
+        {
+            // ??= only evaluates the right-hand side while the cache is still null.
+            _mtmdMarker ??= NativeApi.MtmdDefaultMarker() ?? "<media>";
+            return _mtmdMarker;
+        }
+
-        /// <inheritdoc />
+        /// <summary>Tokenizes the prompt with the clip model, keeps the chunks in <c>_mtmdChunks</c> and stores the text tokens in <c>_embed_inps</c>.</summary>
-        private Task PreprocessLlava(string text, InferStateArgs args, bool addBos = true )
-        {   
-            // If the prompt contains the tag <image> extract this.
-            _imageInPrompt = text.Contains("<image>");
-            if (_imageInPrompt && IsMultiModal)
+        private Task PreprocessMtmd(string text, InferStateArgs args, bool addBos = true)
+        {
+            if (ClipModel is null)
+            {
+                throw new InvalidOperationException("Multimodal execution requires a loaded mtmd clip model.");
+            }
+
+            DisposeMtmdChunks();
+
+            var marker = GetMtmdMarker();
+            var prompt = text;
+            // With embeds queued: rewrite "<image>" tags to the marker; if no marker is present, append one per embed.
+            if (Embeds.Count > 0)
             {
-                foreach (var embed in Embeds)
+                if (prompt.Contains("<image>"))
                 {
-                    // TODO JLS:
-                    //_imageEmbedHandles.Add(SafeMtmdImageEmbedHandle.CreateFromMemory(ClipModel!.NativeHandle, Context, image));
+                    prompt = prompt.Replace("<image>", marker);
                 }
 
-                int imageIndex = text.IndexOf("<image>");
-                // Tokenize segment 1 (before <image> tag)
-                string preImagePrompt = text.Substring(0, imageIndex);
-                var segment1 = Context.Tokenize(preImagePrompt, addBos, true);
-                // Remember the position to add the image embeddings
-                _EmbedImagePosition = segment1.Length;
-                string postImagePrompt = text.Substring(imageIndex + 7);
-                var segment2 = Context.Tokenize(postImagePrompt, false, true);
-                _embed_inps.AddRange(segment1);
-                _embed_inps.AddRange(segment2);
+                if (!prompt.Contains(marker))
+                {
+                    var suffix = string.Concat(Enumerable.Repeat(marker, Embeds.Count));
+                    prompt = string.Concat(prompt, suffix);
+                }
             }
-            else
+
+            SafeMtmdInputChunks? chunks = null;
+            try
             {
+                var status = ClipModel.Tokenize(prompt, addBos, parseSpecial: true, out chunks);
+                if (status != 0 || chunks is null)
+                {
+                    ClipModel.ClearMedia();
+                    throw new RuntimeError($"Failed to tokenize multimodal prompt. Status: {status}.");
+                }
+
+                _mtmdChunks = chunks;
+                // Only text chunks contribute tokens here; the full chunk list stays in _mtmdChunks.
+                var tokens = new List<LLamaToken>();
+                foreach (var chunk in chunks.Enumerate())
+                {
+                    using var scopedChunk = chunk;
+                    if (scopedChunk.Type != SafeMtmdInputChunk.SafeMtmdInputChunkType.Text)
+                    {
+                        continue;
+                    }
+
+                    foreach (var token in scopedChunk.GetTextTokensSpan())
+                    {
+                        tokens.Add(unchecked((int)token));
+                    }
+                }
+                // addBos: replace the pending input; otherwise append and deduct from RemainedTokens.
                 if (addBos)
                 {
-                    _embed_inps = Context.Tokenize(text, true, true).ToList();
+                    _embed_inps = tokens;
                 }
                 else
                 {
-                    var line_inp = Context.Tokenize(text, false, true);
-                    _embed_inps.AddRange(line_inp);
-                    args.RemainedTokens -= line_inp.Length;                    
+                    _embed_inps.AddRange(tokens);
+                    args.RemainedTokens -= tokens.Count;
                 }
             }
+            catch
+            {
+                chunks?.Dispose();
+                _mtmdChunks = null;
+                throw;
+            }
+            finally
+            {
+                DisposeEmbeds(); // Runs on success and on failure alike.
+            }
+
             return Task.CompletedTask;
         }
 
@@ -255,49 +325,60 @@ protected override async Task InferInternal(IInferenceParams inferenceParams, In
                     HandleRunOutOfContext(tokensToKeep);
                 }
 
-                TryReuseMatchingPrefix();
+                if (_mtmdChunks is null)
+                {
+                    TryReuseMatchingPrefix();
+                }
 
-                // Changes to support Multi-Modal LLMs.
-                //
-                (DecodeResult, int, int) header, end, result;
-                if (IsMultiModal &&  _EmbedImagePosition > 0)
+                if (IsMultiModal && _mtmdChunks is not null)
                 {
-                    // Tokens previous to the images
-                    header = await Context.DecodeAsync(_embeds.GetRange(0, _EmbedImagePosition), LLamaSeqId.Zero, batch, _pastTokensCount);
-                    _pastTokensCount = header.Item3;
-
-                    if (header.Item1 != DecodeResult.Ok) throw new LLamaDecodeError(header.Item1);
-                   
-                    // TODO JLS:
-                    // Images
-                    //foreach( var image in _imageEmbedHandles )
-                    //    ClipModel!.EvalImageEmbed(Context, image, ref _pastTokensCount);
-                        
-                    // Post-image Tokens
-                    end = await Context.DecodeAsync(_embeds.GetRange(_EmbedImagePosition, _embeds.Count - _EmbedImagePosition), LLamaSeqId.Zero, batch, _pastTokensCount);
-                    _pastTokensCount = end.Item3;
-
-                    _EmbedImagePosition = -1;
-                    // TODO JLS:
-                    //_imageEmbedHandles.Clear();
-                    Embeds.Clear();
+                    var nPast = (long)_pastTokensCount;
+                    var evalStatus = ClipModel!.EvaluateChunks(_mtmdChunks, Context.NativeHandle, ref nPast, seqId: 0,
+                        nBatch: checked((int)Context.BatchSize), logitsLast: true);
+                    if (evalStatus != 0)
+                    {
+                        DisposeMtmdChunks();
+                        throw new RuntimeError($"Failed to evaluate multimodal chunks. Status: {evalStatus}.");
+                    }
+
+                    _pastTokensCount = checked((int)nPast);
+                    DisposeMtmdChunks();
+
+                    if (_embeds.Count > 0 && !string.IsNullOrEmpty(_pathSession))
+                    {
+                        _session_tokens.AddRange(_embeds);
+                        _n_session_consumed = _session_tokens.Count;
+                    }
                 }
                 else
                 {
-                    result = await Context.DecodeAsync(_embeds, LLamaSeqId.Zero, batch, _pastTokensCount);
+                    var result = await Context.DecodeAsync(_embeds, LLamaSeqId.Zero, batch, _pastTokensCount);
                     _pastTokensCount = result.Item3;
 
                     if (result.Item1 != DecodeResult.Ok) throw new LLamaDecodeError(result.Item1);
-                }
-                
 
-                if (_embeds.Count > 0 && !string.IsNullOrEmpty(_pathSession))
-                {
-                    _session_tokens.AddRange(_embeds);
-                    _n_session_consumed = _session_tokens.Count;
+                    if (_embeds.Count > 0 && !string.IsNullOrEmpty(_pathSession))
+                    {
+                        _session_tokens.AddRange(_embeds);
+                        _n_session_consumed = _session_tokens.Count;
+                    }
                 }
             }
+            else if (IsMultiModal && _mtmdChunks is not null)
+            {
+                _is_prompt_run = false;
+                var nPast = (long)_pastTokensCount;
+                var evalStatus = ClipModel!.EvaluateChunks(_mtmdChunks, Context.NativeHandle, ref nPast, seqId: 0, nBatch: checked((int)Context.BatchSize), logitsLast: true);
+                if (evalStatus != 0)
+                {
+                    DisposeMtmdChunks();
+                    throw new RuntimeError($"Failed to evaluate multimodal chunks. Status: {evalStatus}.");
+                }
 
+                _pastTokensCount = checked((int)nPast);
+                DisposeMtmdChunks();
+            }
+            
             _embeds.Clear();
 
             if (_embed_inps.Count <= _consumedTokensCount && !args.WaitForInput)
@@ -351,7 +432,7 @@ protected override async Task InferInternal(IInferenceParams inferenceParams, In
         /// The descriptor of the state of the interactive executor.
         /// </summary>
         public class InteractiveExecutorState
-            : ExecutorBaseState
+            : StatefulExecutorBase.ExecutorBaseState
         {
             /// <summary>
             /// Whether the executor is running for the first time (running the prompt).
diff --git a/LLama/Native/SafeMtmdInputChunks.cs b/LLama/Native/SafeMtmdInputChunks.cs
@@ -1,4 +1,5 @@
 using System;
+using System.Collections.Generic;
 
 namespace LLama.Native;
 
@@ -34,4 +35,20 @@ public IntPtr GetChunkPtr(ulong index)
         if (index >= Size) throw new IndexOutOfRangeException();
         return NativeApi.mtmd_input_chunks_get(NativePtr, (UIntPtr)index);
     }
-}
+
+    /// <summary>
+    /// Lazily yields a non-owning wrapper for each chunk in this collection, in index order.
+    /// Callers should dispose the returned chunk if they create a copy.
+    /// </summary>
+    public IEnumerable<SafeMtmdInputChunk> Enumerate()
+    {
+        for (ulong index = 0; index < Size; index++)
+        {
+            // Pointers that Wrap turns into null are skipped rather than yielded.
+            if (SafeMtmdInputChunk.Wrap(GetChunkPtr(index)) is { } wrapped)
+            {
+                yield return wrapped;
+            }
+        }
+    }
+}