Commit 78f6137

WIP

1 parent 4e4eaf9 commit 78f6137

2 files changed: 92 additions, 11 deletions

LLama.Examples/ExampleRunner.cs

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@ public class ExampleRunner
         { "Chat Session: Automatic conversation", TalkToYourself.Run },
         { "Chat Session: Chinese characters", ChatChineseGB2312.Run },
         { "Executor: Interactive mode chat", InteractiveModeExecute.Run },
-        { "Executor: Llava Interactive mode chat", MtmdInteractiveModeExecute.Run },
+        { "Executor: Mtmd Interactive mode chat", MtmdInteractiveModeExecute.Run },
         { "Executor: Instruct mode chat", InstructModeExecute.Run },
         { "Executor: Stateless mode chat", StatelessModeExecute.Run },
         { "Save and Load: chat session", SaveAndLoadSession.Run },

LLama/Native/SafeMtmdModelHandle.cs

Lines changed: 91 additions & 10 deletions
@@ -12,6 +12,7 @@ namespace LLama.Native
     /// </summary>
     public sealed class SafeMtmdModelHandle : SafeLLamaHandleBase
     {
+        // Pending media embeddings queued for the next call to Tokenize.
        private readonly List<SafeMtmdEmbed> _pendingMedia = new();
 
        /// <inheritdoc />
@@ -23,13 +24,14 @@ protected override bool ReleaseHandle()
        }
 
        /// <summary>
-        /// Load a model from the given file path into memory
+        /// Load a multimodal projection model from disk and bind it to the supplied text model.
        /// </summary>
-        /// <param name="modelPath">MMP File (Multi-Modal Projections)</param>
-        /// <param name="verbosity">Verbosity level</param>
-        /// <returns>SafeHandle of the Clip Model</returns>
-        /// <exception cref="InvalidOperationException"></exception>
-        /// <exception cref="LoadWeightsFailedException"></exception>
+        /// <param name="modelPath">Path to the MMP (Multi-Modal Projections) file.</param>
+        /// <param name="textModel">Text model that provides tokenizer weights for the multimodal helper.</param>
+        /// <param name="mtmdCtxParams">Optional context parameters; defaults are used when <c>null</c>.</param>
+        /// <returns>Safe handle for the MTMD model.</returns>
+        /// <exception cref="InvalidOperationException">The file exists but is not readable by the current process.</exception>
+        /// <exception cref="LoadWeightsFailedException">The native loader failed to initialize the MTMD model.</exception>
        public static SafeMtmdModelHandle LoadFromFile(string modelPath, LLamaWeights textModel, MtmdContextParams mtmdCtxParams)
        {
            // Try to open the model file, this will check:
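For review context, a rough call-site sketch for the new LoadFromFile signature (not part of this commit): the paths are placeholders, LLamaWeights.LoadFromFile and ModelParams are assumed from the main library, and passing null for the context parameters leans on the "defaults are used when null" doc comment above.

    using LLama;
    using LLama.Common;
    using LLama.Native;

    // Sketch only: placeholder paths; the null context parameters rely on the documented default behaviour.
    using var weights = LLamaWeights.LoadFromFile(new ModelParams("path/to/text-model.gguf"));
    using var mtmd = SafeMtmdModelHandle.LoadFromFile("path/to/mmproj.gguf", weights, null);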
@@ -64,29 +66,41 @@ public static SafeMtmdModelHandle LoadFromFile(string modelPath, LLamaWeights te
        }
 
        /// <summary>
-        /// Load media from disk and keep it pending for the next tokenize call.
+        /// Load media from disk and queue it for the next tokenize call.
        /// </summary>
+        /// <param name="path">Absolute or relative path to the media asset.</param>
+        /// <returns>Safe handle to the media embedding.</returns>
+        /// <exception cref="ObjectDisposedException">The model handle has been disposed.</exception>
+        /// <exception cref="RuntimeError">The native loader failed to ingest the file.</exception>
        public SafeMtmdEmbed LoadMediaFromFile(string path)
        {
+            EnsureNotDisposed();
+
            var embed = SafeMtmdEmbed.FromMediaFile(this, path)
                ?? throw new RuntimeError($"Failed to load media '{path}'.");
            _pendingMedia.Add(embed);
            return embed;
        }
 
        /// <summary>
-        /// Load media from an in-memory buffer and keep it pending for the next tokenize call.
+        /// Load media from an in-memory buffer and queue it for the next tokenize call.
        /// </summary>
+        /// <param name="buffer">Binary buffer containing the encoded media data.</param>
+        /// <returns>Safe handle to the media embedding.</returns>
+        /// <exception cref="ObjectDisposedException">The model handle has been disposed.</exception>
+        /// <exception cref="RuntimeError">The native loader failed to ingest the buffer contents.</exception>
        public SafeMtmdEmbed LoadMediaFromBuffer(ReadOnlySpan<byte> buffer)
        {
+            EnsureNotDisposed();
+
            var embed = SafeMtmdEmbed.FromMediaBuffer(this, buffer)
                ?? throw new RuntimeError("Failed to load media from buffer.");
            _pendingMedia.Add(embed);
            return embed;
        }
 
        /// <summary>
-        /// Clears any pending media buffers tracked for tokenization.
+        /// Disposes and clears any media buffers currently queued for tokenization.
        /// </summary>
        public void ClearMedia()
        {
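A rough illustration of the queueing behaviour these doc comments describe (not from the commit; `mtmd` is the handle from the earlier sketch and the file name is a placeholder):

    using System.IO;

    // Queue an image from disk; it stays pending until the next Tokenize call.
    var image = mtmd.LoadMediaFromFile("picture.jpg");

    // The same data can be queued from memory instead.
    byte[] payload = File.ReadAllBytes("picture.jpg");
    var fromBuffer = mtmd.LoadMediaFromBuffer(payload);

    // If tokenization is abandoned, dispose and drop everything still queued.
    mtmd.ClearMedia();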
@@ -98,15 +112,23 @@ public void ClearMedia()
        /// <summary>
        /// Tokenize a prompt alongside the pending media buffers. Pending media is cleared on success.
        /// </summary>
+        /// <param name="text">Prompt text to tokenize.</param>
+        /// <param name="addSpecial">Whether to append special tokens automatically.</param>
+        /// <param name="parseSpecial">Whether special tokens should be treated as user-provided text.</param>
+        /// <param name="chunks">Receives the native chunk collection when tokenization succeeds.</param>
+        /// <returns>Zero on success; otherwise the native mtmd tokenize error code.</returns>
+        /// <exception cref="ObjectDisposedException">The model handle has been disposed.</exception>
        public int Tokenize(string text, bool addSpecial, bool parseSpecial, out SafeMtmdInputChunks? chunks)
        {
            EnsureNotDisposed();
 
            chunks = null;
+            // Allocate the chunk container before invoking the native tokenizer.
            var output = NativeApi.mtmd_input_chunks_init();
            if (output == IntPtr.Zero)
                throw new RuntimeError("Failed to allocate mtmd_input_chunks.");
 
+            // Collect native pointers to the queued media embeddings.
            var bitmapHandles = new IntPtr[_pendingMedia.Count];
            for (var i = 0; i < _pendingMedia.Count; i++)
                bitmapHandles[i] = _pendingMedia[i].NativePtr;
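Against the documented signature, a caller might combine the queued media with a prompt roughly as follows (illustrative only; the prompt text and flag choices are placeholders, and the zero-return convention comes from the <returns> tag above):

    // Sketch: tokenize a prompt together with whatever media was queued earlier.
    int rc = mtmd.Tokenize("Describe the attached image.", addSpecial: true, parseSpecial: true, out var chunks);
    if (rc != 0 || chunks is null)
        throw new InvalidOperationException($"mtmd tokenization failed with code {rc}.");
    // On success the pending media list has been cleared and `chunks` owns the native collection.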
@@ -138,6 +160,14 @@ public int Tokenize(string text, bool addSpecial, bool parseSpecial, out SafeMtm
        /// <summary>
        /// Evaluate a batch of chunks using the helper (mirrors mtmd-helper eval logic).
        /// </summary>
+        /// <param name="chunks">Chunk collection produced by <see cref="Tokenize"/>.</param>
+        /// <param name="llamaContext">Context handle that receives the evaluated tokens.</param>
+        /// <param name="nPast">Number of past tokens; updated when evaluation succeeds.</param>
+        /// <param name="seqId">Sequence identifier used for KV cache management.</param>
+        /// <param name="nBatch">Maximum number of tokens to evaluate in a single batch.</param>
+        /// <param name="logitsLast">Whether to request logits for the last token only.</param>
+        /// <returns>Zero on success; otherwise the native helper error code.</returns>
+        /// <exception cref="ArgumentNullException">Thrown when required handles are null.</exception>
        public int EvaluateChunks(SafeMtmdInputChunks chunks, SafeLLamaContextHandle llamaContext, ref long nPast, int seqId, int nBatch, bool logitsLast)
        {
            EnsureNotDisposed();
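The documented parameters map onto a call along these lines (sketch only; the LLamaContext.NativeHandle property, batch size and sequence id are assumptions, and the zero-return convention is the one stated above):

    // Sketch: feed the tokenized chunks into an existing llama context.
    long nPast = 0; // no tokens evaluated in this sequence yet
    int evalRc = mtmd.EvaluateChunks(chunks, context.NativeHandle, ref nPast, seqId: 0, nBatch: 512, logitsLast: true);
    if (evalRc != 0)
        throw new InvalidOperationException($"mtmd helper evaluation failed with code {evalRc}.");
    // nPast now covers every position consumed by the text and media chunks.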
@@ -166,6 +196,14 @@ public int EvaluateChunks(SafeMtmdInputChunks chunks, SafeLLamaContextHandle lla
        /// <summary>
        /// Evaluate a single chunk helper.
        /// </summary>
+        /// <param name="chunkPtr">Pointer to the chunk to evaluate.</param>
+        /// <param name="llamaContext">Context handle that receives the evaluated tokens.</param>
+        /// <param name="nPast">Number of past tokens; updated when evaluation succeeds.</param>
+        /// <param name="seqId">Sequence identifier used for KV cache management.</param>
+        /// <param name="nBatch">Maximum number of tokens to evaluate in a single batch.</param>
+        /// <param name="logitsLast">Whether to request logits for the last token only.</param>
+        /// <returns>Zero on success; otherwise the native helper error code.</returns>
+        /// <exception cref="ArgumentNullException">Thrown when required handles are null.</exception>
        public int EvaluateChunk(IntPtr chunkPtr, SafeLLamaContextHandle llamaContext, ref long nPast, int seqId, int nBatch, bool logitsLast)
        {
            EnsureNotDisposed();
@@ -194,10 +232,21 @@ public int EvaluateChunk(IntPtr chunkPtr, SafeLLamaContextHandle llamaContext, r
        /// <summary>
        /// Decode a prepared image chunk whose embedding is already computed.
        /// </summary>
+        /// <param name="chunkPtr">Pointer to the chunk whose embedding should be decoded.</param>
+        /// <param name="llamaContext">Context handle used for decoding.</param>
+        /// <param name="encodedEmbeddings">Pointer to the pre-computed embedding data.</param>
+        /// <param name="nPast">Number of past tokens; updated when evaluation succeeds.</param>
+        /// <param name="seqId">Sequence identifier used for KV cache management.</param>
+        /// <param name="nBatch">Maximum number of tokens to evaluate in a single batch.</param>
+        /// <returns>Zero on success; otherwise the native helper error code.</returns>
+        /// <exception cref="ArgumentNullException">Thrown when required handles are null.</exception>
        public int DecodeImageChunk(IntPtr chunkPtr, SafeLLamaContextHandle llamaContext, IntPtr encodedEmbeddings, ref long nPast, int seqId, int nBatch)
        {
            EnsureNotDisposed();
 
+            if (chunkPtr == IntPtr.Zero)
+                throw new ArgumentNullException(nameof(chunkPtr));
+
            var result = NativeApi.mtmd_helper_decode_image_chunk(
                DangerousGetHandle(),
                llamaContext?.DangerousGetHandle() ?? throw new ArgumentNullException(nameof(llamaContext)),
@@ -214,13 +263,23 @@ public int DecodeImageChunk(IntPtr chunkPtr, SafeLLamaContextHandle llamaContext
            return result;
        }
 
+        /// <summary>
+        /// Get the number of tokens contained in the provided chunk collection.
+        /// </summary>
+        /// <param name="chunks">Chunk collection produced by <see cref="Tokenize"/>.</param>
+        /// <returns>Total token count.</returns>
        public ulong CountTokens(SafeMtmdInputChunks chunks)
        {
            if (chunks == null)
                throw new ArgumentNullException(nameof(chunks));
            return NativeApi.mtmd_helper_get_n_tokens(chunks.NativePtr).ToUInt64();
        }
 
+        /// <summary>
+        /// Get the number of positions contained in the provided chunk collection.
+        /// </summary>
+        /// <param name="chunks">Chunk collection produced by <see cref="Tokenize"/>.</param>
+        /// <returns>Total number of positional slots consumed.</returns>
        public long CountPositions(SafeMtmdInputChunks chunks)
        {
            if (chunks == null)
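The two new counters allow simple bookkeeping before evaluation, e.g. to check that the chunks fit the context window (sketch; the console output is illustrative):

    // Sketch: inspect the size of the chunk collection produced by Tokenize.
    ulong tokenCount = mtmd.CountTokens(chunks);
    long positionCount = mtmd.CountPositions(chunks);
    Console.WriteLine($"Tokenized into {tokenCount} tokens over {positionCount} positions.");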
@@ -231,7 +290,7 @@ public long CountPositions(SafeMtmdInputChunks chunks)
        #region native API
 
        // mtmd_init_from_file(const char * mmproj_fname, const struct llama_model * text_model, const struct mtmd_context_params ctx_params);
-        // We don't know llama_model layout. Accept IntPtr for text_model.
+        // The llama_model layout is opaque; expose it via SafeLlamaModelHandle to match the managed wrapper.
        [DllImport(NativeApi.mtmdLibraryName, EntryPoint = "mtmd_init_from_file", CallingConvention = CallingConvention.Cdecl)]
        private static extern unsafe SafeMtmdModelHandle mtmd_init_from_file(
            byte* mmproj_fname,
@@ -245,15 +304,37 @@ private static extern unsafe SafeMtmdModelHandle mtmd_init_from_file(
 
 
 
+        /// <summary>
+        /// Finalizer to ensure native resources are released if Dispose was not called.
+        /// </summary>
        ~SafeMtmdModelHandle()
        {
            Dispose();
        }
 
+        /// <summary>
+        /// Indicates whether the model decodes using the non-causal path.
+        /// </summary>
        public bool DecodeUseNonCausal() => NativeApi.mtmd_decode_use_non_causal(handle);
+
+        /// <summary>
+        /// Indicates whether the model decodes using multi-scale RoPE.
+        /// </summary>
        public bool DecodeUseMRope() => NativeApi.mtmd_decode_use_mrope(handle);
+
+        /// <summary>
+        /// Indicates whether the model supports vision inputs.
+        /// </summary>
        public bool SupportVision() => NativeApi.mtmd_support_vision(handle);
+
+        /// <summary>
+        /// Indicates whether the model supports audio inputs.
+        /// </summary>
        public bool SupportAudio() => NativeApi.mtmd_support_audio(handle);
+
+        /// <summary>
+        /// Gets the audio bitrate advertised by the model.
+        /// </summary>
        public int GetAudioBitrate() => NativeApi.mtmd_get_audio_bitrate(handle);
 
        private void EnsureNotDisposed()
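The new capability accessors can be probed right after loading, for example (sketch; the guard logic and output are illustrative, and the bitrate units are not specified by the API):

    // Sketch: query what the loaded multimodal projector supports.
    if (mtmd.SupportVision())
        Console.WriteLine("Vision inputs are supported.");
    if (mtmd.SupportAudio())
        Console.WriteLine($"Audio inputs are supported (reported bitrate: {mtmd.GetAudioBitrate()}).");
    Console.WriteLine($"Non-causal decode: {mtmd.DecodeUseNonCausal()}, M-RoPE: {mtmd.DecodeUseMRope()}");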
