Commit 78f6137

WIP

1 parent 4e4eaf9 commit 78f6137

2 files changed: 92 additions, 11 deletions

LLama.Examples/ExampleRunner.cs

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@ public class ExampleRunner
         { "Chat Session: Automatic conversation", TalkToYourself.Run },
         { "Chat Session: Chinese characters", ChatChineseGB2312.Run },
         { "Executor: Interactive mode chat", InteractiveModeExecute.Run },
-        { "Executor: Llava Interactive mode chat", MtmdInteractiveModeExecute.Run },
+        { "Executor: Mtmd Interactive mode chat", MtmdInteractiveModeExecute.Run },
         { "Executor: Instruct mode chat", InstructModeExecute.Run },
         { "Executor: Stateless mode chat", StatelessModeExecute.Run },
         { "Save and Load: chat session", SaveAndLoadSession.Run },

LLama/Native/SafeMtmdModelHandle.cs

Lines changed: 91 additions & 10 deletions
@@ -12,6 +12,7 @@ namespace LLama.Native
     /// </summary>
     public sealed class SafeMtmdModelHandle : SafeLLamaHandleBase
     {
+        // Pending media embeddings queued for the next call to Tokenize.
        private readonly List<SafeMtmdEmbed> _pendingMedia = new();
 
        /// <inheritdoc />
@@ -23,13 +24,14 @@ protected override bool ReleaseHandle()
        }
 
        /// <summary>
-        /// Load a model from the given file path into memory
+        /// Load a multimodal projection model from disk and bind it to the supplied text model.
        /// </summary>
-        /// <param name="modelPath">MMP File (Multi-Modal Projections)</param>
-        /// <param name="verbosity">Verbosity level</param>
-        /// <returns>SafeHandle of the Clip Model</returns>
-        /// <exception cref="InvalidOperationException"></exception>
-        /// <exception cref="LoadWeightsFailedException"></exception>
+        /// <param name="modelPath">Path to the MMP (Multi-Modal Projections) file.</param>
+        /// <param name="textModel">Text model that provides tokenizer weights for the multimodal helper.</param>
+        /// <param name="mtmdCtxParams">Optional context parameters; defaults are used when <c>null</c>.</param>
+        /// <returns>Safe handle for the MTMD model.</returns>
+        /// <exception cref="InvalidOperationException">The file exists but is not readable by the current process.</exception>
+        /// <exception cref="LoadWeightsFailedException">The native loader failed to initialize the MTMD model.</exception>
        public static SafeMtmdModelHandle LoadFromFile(string modelPath, LLamaWeights textModel, MtmdContextParams mtmdCtxParams)
        {
            // Try to open the model file, this will check:
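For review context, a rough call-site sketch for the new LoadFromFile signature (not part of this commit): the paths are placeholders, LLamaWeights.LoadFromFile and ModelParams are assumed from the main library, and passing null for the context parameters leans on the "defaults are used when null" doc comment above.

    using LLama;
    using LLama.Common;
    using LLama.Native;

    // Sketch only: placeholder paths; the null context parameters rely on the documented default behaviour.
    using var weights = LLamaWeights.LoadFromFile(new ModelParams("path/to/text-model.gguf"));
    using var mtmd = SafeMtmdModelHandle.LoadFromFile("path/to/mmproj.gguf", weights, null);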
@@ -64,29 +66,41 @@ public static SafeMtmdModelHandle LoadFromFile(string modelPath, LLamaWeights te
        }
 
        /// <summary>
-        /// Load media from disk and keep it pending for the next tokenize call.
+        /// Load media from disk and queue it for the next tokenize call.
        /// </summary>
+        /// <param name="path">Absolute or relative path to the media asset.</param>
+        /// <returns>Safe handle to the media embedding.</returns>
+        /// <exception cref="ObjectDisposedException">The model handle has been disposed.</exception>
+        /// <exception cref="RuntimeError">The native loader failed to ingest the file.</exception>
        public SafeMtmdEmbed LoadMediaFromFile(string path)
        {
+            EnsureNotDisposed();
+
            var embed = SafeMtmdEmbed.FromMediaFile(this, path)
                ?? throw new RuntimeError($"Failed to load media '{path}'.");
            _pendingMedia.Add(embed);
            return embed;
        }
 
        /// <summary>
-        /// Load media from an in-memory buffer and keep it pending for the next tokenize call.
+        /// Load media from an in-memory buffer and queue it for the next tokenize call.
        /// </summary>
+        /// <param name="buffer">Binary buffer containing the encoded media data.</param>
+        /// <returns>Safe handle to the media embedding.</returns>
+        /// <exception cref="ObjectDisposedException">The model handle has been disposed.</exception>
+        /// <exception cref="RuntimeError">The native loader failed to ingest the buffer contents.</exception>
        public SafeMtmdEmbed LoadMediaFromBuffer(ReadOnlySpan<byte> buffer)
        {
+            EnsureNotDisposed();
+
            var embed = SafeMtmdEmbed.FromMediaBuffer(this, buffer)
                ?? throw new RuntimeError("Failed to load media from buffer.");
            _pendingMedia.Add(embed);
            return embed;
        }
 
        /// <summary>
-        /// Clears any pending media buffers tracked for tokenization.
+        /// Disposes and clears any media buffers currently queued for tokenization.
        /// </summary>
        public void ClearMedia()
        {
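A rough illustration of the queueing behaviour these doc comments describe (not from the commit; `mtmd` is the handle from the earlier sketch and the file name is a placeholder):

    using System.IO;

    // Queue an image from disk; it stays pending until the next Tokenize call.
    var image = mtmd.LoadMediaFromFile("picture.jpg");

    // The same data can be queued from memory instead.
    byte[] payload = File.ReadAllBytes("picture.jpg");
    var fromBuffer = mtmd.LoadMediaFromBuffer(payload);

    // If tokenization is abandoned, dispose and drop everything still queued.
    mtmd.ClearMedia();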
@@ -98,15 +112,23 @@ public void ClearMedia()
        /// <summary>
        /// Tokenize a prompt alongside the pending media buffers. Pending media is cleared on success.
        /// </summary>
+        /// <param name="text">Prompt text to tokenize.</param>
+        /// <param name="addSpecial">Whether to append special tokens automatically.</param>
+        /// <param name="parseSpecial">Whether special tokens should be treated as user-provided text.</param>
+        /// <param name="chunks">Receives the native chunk collection when tokenization succeeds.</param>
+        /// <returns>Zero on success; otherwise the native mtmd tokenize error code.</returns>
+        /// <exception cref="ObjectDisposedException">The model handle has been disposed.</exception>
        public int Tokenize(string text, bool addSpecial, bool parseSpecial, out SafeMtmdInputChunks? chunks)
        {
            EnsureNotDisposed();
 
            chunks = null;
+            // Allocate the chunk container before invoking the native tokenizer.
            var output = NativeApi.mtmd_input_chunks_init();
            if (output == IntPtr.Zero)
                throw new RuntimeError("Failed to allocate mtmd_input_chunks.");
 
+            // Collect native pointers to the queued media embeddings.
            var bitmapHandles = new IntPtr[_pendingMedia.Count];
            for (var i = 0; i < _pendingMedia.Count; i++)
                bitmapHandles[i] = _pendingMedia[i].NativePtr;
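Against the documented signature, a caller might combine the queued media with a prompt roughly as follows (illustrative only; the prompt text and flag choices are placeholders, and the zero-return convention comes from the <returns> tag above):

    // Sketch: tokenize a prompt together with whatever media was queued earlier.
    int rc = mtmd.Tokenize("Describe the attached image.", addSpecial: true, parseSpecial: true, out var chunks);
    if (rc != 0 || chunks is null)
        throw new InvalidOperationException($"mtmd tokenization failed with code {rc}.");
    // On success the pending media list has been cleared and `chunks` owns the native collection.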
@@ -138,6 +160,14 @@ public int Tokenize(string text, bool addSpecial, bool parseSpecial, out SafeMtm
        /// <summary>
        /// Evaluate a batch of chunks using the helper (mirrors mtmd-helper eval logic).
        /// </summary>
+        /// <param name="chunks">Chunk collection produced by <see cref="Tokenize"/>.</param>
+        /// <param name="llamaContext">Context handle that receives the evaluated tokens.</param>
+        /// <param name="nPast">Number of past tokens; updated when evaluation succeeds.</param>
+        /// <param name="seqId">Sequence identifier used for KV cache management.</param>
+        /// <param name="nBatch">Maximum number of tokens to evaluate in a single batch.</param>
+        /// <param name="logitsLast">Whether to request logits for the last token only.</param>
+        /// <returns>Zero on success; otherwise the native helper error code.</returns>
+        /// <exception cref="ArgumentNullException">Thrown when required handles are null.</exception>
        public int EvaluateChunks(SafeMtmdInputChunks chunks, SafeLLamaContextHandle llamaContext, ref long nPast, int seqId, int nBatch, bool logitsLast)
        {
            EnsureNotDisposed();
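The documented parameters map onto a call along these lines (sketch only; the LLamaContext.NativeHandle property, batch size and sequence id are assumptions, and the zero-return convention is the one stated above):

    // Sketch: feed the tokenized chunks into an existing llama context.
    long nPast = 0; // no tokens evaluated in this sequence yet
    int evalRc = mtmd.EvaluateChunks(chunks, context.NativeHandle, ref nPast, seqId: 0, nBatch: 512, logitsLast: true);
    if (evalRc != 0)
        throw new InvalidOperationException($"mtmd helper evaluation failed with code {evalRc}.");
    // nPast now covers every position consumed by the text and media chunks.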
@@ -166,6 +196,14 @@ public int EvaluateChunks(SafeMtmdInputChunks chunks, SafeLLamaContextHandle lla
        /// <summary>
        /// Evaluate a single chunk helper.
        /// </summary>
+        /// <param name="chunkPtr">Pointer to the chunk to evaluate.</param>
+        /// <param name="llamaContext">Context handle that receives the evaluated tokens.</param>
+        /// <param name="nPast">Number of past tokens; updated when evaluation succeeds.</param>
+        /// <param name="seqId">Sequence identifier used for KV cache management.</param>
+        /// <param name="nBatch">Maximum number of tokens to evaluate in a single batch.</param>
+        /// <param name="logitsLast">Whether to request logits for the last token only.</param>
+        /// <returns>Zero on success; otherwise the native helper error code.</returns>
+        /// <exception cref="ArgumentNullException">Thrown when required handles are null.</exception>
        public int EvaluateChunk(IntPtr chunkPtr, SafeLLamaContextHandle llamaContext, ref long nPast, int seqId, int nBatch, bool logitsLast)
        {
            EnsureNotDisposed();
@@ -194,10 +232,21 @@ public int EvaluateChunk(IntPtr chunkPtr, SafeLLamaContextHandle llamaContext, r
        /// <summary>
        /// Decode a prepared image chunk whose embedding is already computed.
        /// </summary>
+        /// <param name="chunkPtr">Pointer to the chunk whose embedding should be decoded.</param>
+        /// <param name="llamaContext">Context handle used for decoding.</param>
+        /// <param name="encodedEmbeddings">Pointer to the pre-computed embedding data.</param>
+        /// <param name="nPast">Number of past tokens; updated when evaluation succeeds.</param>
+        /// <param name="seqId">Sequence identifier used for KV cache management.</param>
+        /// <param name="nBatch">Maximum number of tokens to evaluate in a single batch.</param>
+        /// <returns>Zero on success; otherwise the native helper error code.</returns>
+        /// <exception cref="ArgumentNullException">Thrown when required handles are null.</exception>
        public int DecodeImageChunk(IntPtr chunkPtr, SafeLLamaContextHandle llamaContext, IntPtr encodedEmbeddings, ref long nPast, int seqId, int nBatch)
        {
            EnsureNotDisposed();
 
+            if (chunkPtr == IntPtr.Zero)
+                throw new ArgumentNullException(nameof(chunkPtr));
+
            var result = NativeApi.mtmd_helper_decode_image_chunk(
                DangerousGetHandle(),
                llamaContext?.DangerousGetHandle() ?? throw new ArgumentNullException(nameof(llamaContext)),
@@ -214,13 +263,23 @@ public int DecodeImageChunk(IntPtr chunkPtr, SafeLLamaContextHandle llamaContext
            return result;
        }
 
+        /// <summary>
+        /// Get the number of tokens contained in the provided chunk collection.
+        /// </summary>
+        /// <param name="chunks">Chunk collection produced by <see cref="Tokenize"/>.</param>
+        /// <returns>Total token count.</returns>
        public ulong CountTokens(SafeMtmdInputChunks chunks)
        {
            if (chunks == null)
                throw new ArgumentNullException(nameof(chunks));
            return NativeApi.mtmd_helper_get_n_tokens(chunks.NativePtr).ToUInt64();
        }
 
+        /// <summary>
+        /// Get the number of positions contained in the provided chunk collection.
+        /// </summary>
+        /// <param name="chunks">Chunk collection produced by <see cref="Tokenize"/>.</param>
+        /// <returns>Total number of positional slots consumed.</returns>
        public long CountPositions(SafeMtmdInputChunks chunks)
        {
            if (chunks == null)
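The two new counters allow simple bookkeeping before evaluation, e.g. to check that the chunks fit the context window (sketch; the console output is illustrative):

    // Sketch: inspect the size of the chunk collection produced by Tokenize.
    ulong tokenCount = mtmd.CountTokens(chunks);
    long positionCount = mtmd.CountPositions(chunks);
    Console.WriteLine($"Tokenized into {tokenCount} tokens over {positionCount} positions.");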
@@ -231,7 +290,7 @@ public long CountPositions(SafeMtmdInputChunks chunks)
        #region native API
 
        // mtmd_init_from_file(const char * mmproj_fname, const struct llama_model * text_model, const struct mtmd_context_params ctx_params);
-        // We don't know llama_model layout. Accept IntPtr for text_model.
+        // The llama_model layout is opaque; expose it via SafeLlamaModelHandle to match the managed wrapper.
        [DllImport(NativeApi.mtmdLibraryName, EntryPoint = "mtmd_init_from_file", CallingConvention = CallingConvention.Cdecl)]
        private static extern unsafe SafeMtmdModelHandle mtmd_init_from_file(
            byte* mmproj_fname,
@@ -245,15 +304,37 @@ private static extern unsafe SafeMtmdModelHandle mtmd_init_from_file(
 
 
 
+        /// <summary>
+        /// Finalizer to ensure native resources are released if Dispose was not called.
+        /// </summary>
        ~SafeMtmdModelHandle()
        {
            Dispose();
        }
 
+        /// <summary>
+        /// Indicates whether the model decodes using the non-causal path.
+        /// </summary>
        public bool DecodeUseNonCausal() => NativeApi.mtmd_decode_use_non_causal(handle);
+
+        /// <summary>
+        /// Indicates whether the model decodes using multi-scale RoPE.
+        /// </summary>
        public bool DecodeUseMRope() => NativeApi.mtmd_decode_use_mrope(handle);
+
+        /// <summary>
+        /// Indicates whether the model supports vision inputs.
+        /// </summary>
        public bool SupportVision() => NativeApi.mtmd_support_vision(handle);
+
+        /// <summary>
+        /// Indicates whether the model supports audio inputs.
+        /// </summary>
        public bool SupportAudio() => NativeApi.mtmd_support_audio(handle);
+
+        /// <summary>
+        /// Gets the audio bitrate advertised by the model.
+        /// </summary>
        public int GetAudioBitrate() => NativeApi.mtmd_get_audio_bitrate(handle);
 
        private void EnsureNotDisposed()
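The new capability accessors can be probed right after loading, for example (sketch; the guard logic and output are illustrative, and the bitrate units are not specified by the API):

    // Sketch: query what the loaded multimodal projector supports.
    if (mtmd.SupportVision())
        Console.WriteLine("Vision inputs are supported.");
    if (mtmd.SupportAudio())
        Console.WriteLine($"Audio inputs are supported (reported bitrate: {mtmd.GetAudioBitrate()}).");
    Console.WriteLine($"Non-causal decode: {mtmd.DecodeUseNonCausal()}, M-RoPE: {mtmd.DecodeUseMRope()}");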
