
Commit ffff841

migrate clip to mtmd
1 parent 03ce53b commit ffff841

File tree

4 files changed: +279 -57 lines changed


llama_cpp/llama_chat_format.py

Lines changed: 52 additions & 55 deletions
@@ -2716,7 +2716,7 @@ class Llava15ChatHandler:
         "{% endif %}"
     )
 
-    def __init__(self, clip_model_path: str, verbose: bool = True):
+    def __init__(self, clip_model_path: str, llama_model: llama.Llama, verbose: bool = True):
         import llama_cpp.mtmd_cpp as mtmd_cpp
 
         self.clip_model_path = clip_model_path
@@ -3630,62 +3630,59 @@ def split_text_on_image_urls(text: str, image_urls: List[str]):
         return split_text
 
     def eval_image(self, llama: llama.Llama, image_url: str):
-        import llama_cpp
+        image_bytes = self.load_image(image_url)
+
+        # Create bitmap manager if not exists
+        if self._bitmap_manager is None:
+            self._bitmap_manager = self._mtmd_cpp.BitmapManager()
+
+        # Create bitmap from bytes
+        if not self._bitmap_manager.add_from_memory(self.clip_ctx, image_bytes):
+            raise ValueError("Failed to create bitmap from image bytes")
+
+        # Create input chunks for the bitmap
+        chunks = self._mtmd_cpp.mtmd_input_chunks_init()
+        if chunks is None:
+            raise ValueError("Failed to create input chunks")
+
+        # Create input text with media marker
+        # Get media marker from context params
+        params = self._mtmd_cpp.mtmd_context_params_default()
+        text = self._mtmd_cpp.mtmd_input_text()
+        text.text = params.media_marker if params.media_marker else self._mtmd_cpp.mtmd_default_marker()
+        text.add_special = False
+        text.parse_special = True
+
+        # Tokenize with bitmap
+        if self._mtmd_cpp.mtmd_tokenize(self.clip_ctx, chunks, text, self._bitmap_manager.c_ptr(), len(self._bitmap_manager.entries)) != 0:
+            self._mtmd_cpp.mtmd_input_chunks_free(chunks)
+            raise ValueError("Failed to tokenize image")
+
+        # Get new n_past after evaluation
+        n_past = ctypes.c_int(llama.n_tokens)
+        n_past_p = ctypes.pointer(n_past)
 
-        n_tokens = 256
-        if llama.n_tokens + n_tokens > llama.n_ctx():
-            raise ValueError(
-                f"Prompt exceeds n_ctx: {llama.n_tokens + n_tokens} > {llama.n_ctx()}"
-            )
+        # Evaluate chunks
+        if self._mtmd_cpp.mtmd_helper_eval_chunks(
+            self.clip_ctx,
+            llama.ctx,
+            chunks,
+            llama.n_tokens,
+            0,  # seq_id
+            llama.n_batch,
+            True,  # logits_last
+            n_past_p
+        ) != 0:
+            self._mtmd_cpp.mtmd_input_chunks_free(chunks)
+            raise ValueError("Failed to evaluate chunks")
+
+        # Update n_tokens
+        llama.input_ids[llama.n_tokens : n_past.value] = -1
+        llama.n_tokens = n_past.value
 
-        img_bytes = self.load_image(image_url)
-        img_u8_p = self._mtmd_cpp.clip_image_u8_init()
-        if not self._mtmd_cpp.clip_image_load_from_bytes(
-            ctypes.create_string_buffer(img_bytes, len(img_bytes)),
-            ctypes.c_size_t(len(img_bytes)),
-            img_u8_p,
-        ):
-            self._mtmd_cpp.clip_image_u8_free(img_u8_p)
-            raise ValueError("Failed to load image.")
-
-        img_f32_p = self._mtmd_cpp.clip_image_f32_batch_init()
-        if not self._mtmd_cpp.clip_image_preprocess(self.clip_ctx, img_u8_p, img_f32_p):
-            self._mtmd_cpp.clip_image_f32_batch_free(img_f32_p)
-            self._mtmd_cpp.clip_image_u8_free(img_u8_p)
-            raise ValueError("Failed to preprocess image.")
-
-        n_embd = llama_cpp.llama_model_n_embd(llama._model.model)
-        embed = (ctypes.c_float * (n_tokens * n_embd))()
-        if not self._mtmd_cpp.clip_image_batch_encode(self.clip_ctx, llama.n_threads, img_f32_p, embed):
-            self._mtmd_cpp.clip_image_f32_batch_free(img_f32_p)
-            self._mtmd_cpp.clip_image_u8_free(img_u8_p)
-            raise ValueError("Failed to encode image.")
-
-        self._mtmd_cpp.clip_image_f32_batch_free(img_f32_p)
-        self._mtmd_cpp.clip_image_u8_free(img_u8_p)
-        llama_cpp.llama_set_causal_attn(llama.ctx, False)
-
-        seq_id_0 = (ctypes.c_int32 * 1)()
-        seq_ids = (ctypes.POINTER(ctypes.c_int32) * (n_tokens + 1))()
-        for i in range(n_tokens):
-            seq_ids[i] = seq_id_0
-
-        batch = llama_cpp.llama_batch()
-        batch.n_tokens = n_tokens
-        batch.token = None
-        batch.embd = embed
-        batch.pos = (ctypes.c_int32 * n_tokens)(*[i + llama.n_tokens for i in range(n_tokens)])
-        batch.seq_id = seq_ids
-        batch.n_seq_id = (ctypes.c_int32 * n_tokens)(*([1] * n_tokens))
-        batch.logits = (ctypes.c_int8 * n_tokens)()
-
-        if llama_cpp.llama_decode(llama.ctx, batch):
-            raise ValueError("Failed to decode image.")
-
-        llama_cpp.llama_set_causal_attn(llama.ctx, True)
-        # Required to avoid issues with hf tokenizer
-        llama.input_ids[llama.n_tokens : llama.n_tokens + n_tokens] = -1
-        llama.n_tokens += n_tokens
+        # Cleanup
+        self._mtmd_cpp.mtmd_input_chunks_free(chunks)
+        self._bitmap_manager.clear()
 
 
     def _accumulate_chunks(
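Note on the hunk above: eval_image no longer hand-rolls the CLIP pipeline (load bytes, preprocess, batch-encode, then llama_decode of a fixed 256-token embedding batch); it delegates tokenization and decoding to mtmd's helper, which reports the resulting n_past back through an out-pointer. The only API-visible consequence is the new constructor parameter. A minimal sketch of a migrated call site (the model file names are illustrative, not part of this commit):

import llama_cpp
from llama_cpp.llama_chat_format import Llava15ChatHandler

model = llama_cpp.Llama("llava-v1.5-7b-Q8_0.gguf", n_ctx=2048)  # illustrative path

# Before this commit:
#   handler = Llava15ChatHandler(clip_model_path="mmproj-model-f16.gguf")
# After it, the handler also needs the text model up front:
handler = Llava15ChatHandler(clip_model_path="mmproj-model-f16.gguf", llama_model=model)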

llama_cpp/mtmd_cpp.py

Lines changed: 147 additions & 2 deletions
@@ -1,10 +1,12 @@
 from __future__ import annotations
 
 import os
+import ctypes
 from ctypes import (
     c_bool,
     c_char_p,
     c_int,
+    c_int32,
     c_uint8,
     c_uint32,
     c_float,
@@ -17,6 +19,7 @@
 )
 import pathlib
 from typing import (
+    List,
     Union,
     NewType,
     Optional,
@@ -31,19 +34,161 @@
 )
 
 if TYPE_CHECKING:
+    from llama_cpp.llama_types import (
+        llama_token,
+        llama_pos,
+    )
     from llama_cpp._ctypes_extensions import (
         CtypesArray,
+        CtypesPointer,
     )
 
+# Define input text structure
+class mtmd_input_text(Structure):
+    _fields_ = [
+        ("text", c_char_p),
+        ("add_special", c_bool),
+        ("parse_special", c_bool),
+    ]
+
+# Define context parameters structure
+class mtmd_context_params(Structure):
+    _fields_ = [
+        ("use_gpu", c_bool),
+        ("print_timings", c_bool),
+        ("n_threads", c_int),
+        ("verbosity", c_int),
+        ("image_marker", c_char_p),  # const char*
+        ("media_marker", c_char_p),  # const char*
+    ]
+
+# Define input chunk type enum
+mtmd_input_chunk_type = c_int
+(
+    MTMD_INPUT_CHUNK_TYPE_TEXT,
+    MTMD_INPUT_CHUNK_TYPE_IMAGE,
+    MTMD_INPUT_CHUNK_TYPE_AUDIO,
+) = (0, 1, 2)
+
+# Define slice template enum
+mtmd_slice_tmpl = c_int
+(
+    MTMD_SLICE_TMPL_NONE,
+    MTMD_SLICE_TMPL_MINICPMV_2_5,
+    MTMD_SLICE_TMPL_MINICPMV_2_6,
+    MTMD_SLICE_TMPL_LLAMA4,
+) = (0, 1, 2, 3)
+
+# Define whisper filters structure
+class whisper_filters(Structure):
+    _fields_ = [
+        ("n_mel", c_int),
+    ]
+
+# Define mtmd_context structure
+class mtmd_context(Structure):
+    _fields_ = [
+        ("ctx_v", c_void_p),  # clip_ctx*
+        ("ctx_a", c_void_p),  # clip_ctx*
+        ("text_model", c_void_p),  # const llama_model*
+        ("image_embd_v", POINTER(c_float)),  # std::vector<float>
+        ("print_timings", c_bool),
+        ("n_threads", c_int),
+        ("media_marker", c_char_p),  # std::string
+        ("n_embd_text", c_int),
+        ("img_beg", c_char_p),  # std::string
+        ("img_end", c_char_p),  # std::string
+        ("aud_beg", c_char_p),  # std::string
+        ("aud_end", c_char_p),  # std::string
+        ("slice_tmpl", c_int),  # mtmd_slice_tmpl
+        ("tok_ov_img_start", llama_cpp.llama_token),
+        ("tok_ov_img_end", llama_cpp.llama_token),
+        ("tok_slices_start", llama_cpp.llama_token),
+        ("tok_slices_end", llama_cpp.llama_token),
+        ("tok_sli_img_start", llama_cpp.llama_token),
+        ("tok_sli_img_end", llama_cpp.llama_token),
+        ("tok_sli_img_mid", llama_cpp.llama_token),
+        ("tok_row_end", llama_cpp.llama_token),
+        ("tok_row_end_trail", c_bool),
+        ("ov_img_first", c_bool),
+        ("use_mrope", c_bool),
+        ("w_filters", whisper_filters),
+    ]
+
+# Define bitmap structure
+class mtmd_bitmap(Structure):
+    _fields_ = [
+        ("nx", c_uint32),
+        ("ny", c_uint32),
+        ("data", POINTER(c_uint8)),  # Vector represented as pointer
+        ("id", c_char_p),
+        ("is_audio", c_bool),
+    ]
+
+# Define image tokens structure
+class mtmd_image_tokens(Structure):
+    _fields_ = [
+        ("nx", c_uint32),
+        ("ny", c_uint32),
+        ("use_mrope_pos", c_bool),
+        ("batch_f32", c_void_p),  # clip_image_f32_batch
+        ("id", c_char_p),
+    ]
 
-# Specify the base name of the shared library to load
+# Define audio tokens structure
+class mtmd_audio_tokens(Structure):
+    _fields_ = [
+        ("n_tokens", c_uint32),
+        ("batch_f32", c_void_p),  # clip_image_f32_batch
+        ("id", c_char_p),
+    ]
+
+# Define input chunk structure
+class mtmd_input_chunk(Structure):
+    _fields_ = [
+        ("type", mtmd_input_chunk_type),
+        ("tokens_text", POINTER(llama_cpp.llama_token)),  # Vector represented as pointer
+        ("tokens_image", c_void_p),  # mtmd_image_tokens_ptr
+        ("tokens_audio", c_void_p),  # mtmd_audio_tokens_ptr
+    ]
+
+# Define input chunks structure
+class mtmd_input_chunks(Structure):
+    _fields_ = [
+        ("entries", POINTER(mtmd_input_chunk)),  # Vector represented as pointer
+    ]
+
+# Define context pointer type
+mtmd_context_p = NewType("mtmd_context_p", int)
+mtmd_context_p_ctypes = c_void_p
+
+# Define bitmap pointer type
+mtmd_bitmap_p = NewType("mtmd_bitmap_p", int)
+mtmd_bitmap_p_ctypes = c_void_p
+
+# Define input chunks pointer type
+mtmd_input_chunks_p = NewType("mtmd_input_chunks_p", int)
+mtmd_input_chunks_p_ctypes = c_void_p
+
+# Define input chunk pointer type
+mtmd_input_chunk_p = NewType("mtmd_input_chunk_p", int)
+mtmd_input_chunk_p_ctypes = c_void_p
+
+# Define image tokens pointer type
+mtmd_image_tokens_p = NewType("mtmd_image_tokens_p", int)
+mtmd_image_tokens_p_ctypes = c_void_p
+
+# Define audio tokens pointer type
+mtmd_audio_tokens_p = NewType("mtmd_audio_tokens_p", int)
+mtmd_audio_tokens_p_ctypes = c_void_p
+
+# Load the library
 _libmtmd_base_name = "mtmd"
 _libmtmd_override_path = os.environ.get("MTMD_CPP_LIB")
 _libmtmd_base_path = pathlib.Path(os.path.abspath(os.path.dirname(__file__))) / "lib" if _libmtmd_override_path is None else pathlib.Path()
 
 # Load the library
 _libmtmd = load_shared_library(_libmtmd_base_name, _libmtmd_base_path)
-
 ctypes_function = ctypes_function_for_shared_library(_libmtmd)
 
 ################################################
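Taken together with the eval_image hunk above, these bindings are driven in a fixed sequence: bitmap in, chunks out, helper decode. A condensed, standalone sketch of that handshake, assuming the BitmapManager wrapper and the mtmd_* functions bound in this module, plus already-initialized mtmd and llama contexts (a sketch of the call order, not a tested API):

import ctypes
import llama_cpp.mtmd_cpp as mtmd_cpp

def eval_image_sketch(mtmd_ctx, llama_ctx, image_bytes, n_past_start, n_batch):
    # Wrap the raw image bytes in an mtmd bitmap
    bitmaps = mtmd_cpp.BitmapManager()
    if not bitmaps.add_from_memory(mtmd_ctx, image_bytes):
        raise ValueError("Failed to create bitmap from image bytes")

    # Tokenize a media-marker prompt together with the bitmap into chunks
    chunks = mtmd_cpp.mtmd_input_chunks_init()
    params = mtmd_cpp.mtmd_context_params_default()
    text = mtmd_cpp.mtmd_input_text()
    text.text = params.media_marker if params.media_marker else mtmd_cpp.mtmd_default_marker()
    text.add_special = False
    text.parse_special = True
    if mtmd_cpp.mtmd_tokenize(mtmd_ctx, chunks, text,
                              bitmaps.c_ptr(), len(bitmaps.entries)) != 0:
        mtmd_cpp.mtmd_input_chunks_free(chunks)
        raise ValueError("Failed to tokenize image")

    # Let the helper decode all chunks; it reports the new n_past via pointer
    n_past = ctypes.c_int(n_past_start)
    if mtmd_cpp.mtmd_helper_eval_chunks(mtmd_ctx, llama_ctx, chunks,
                                        n_past_start, 0, n_batch, True,
                                        ctypes.pointer(n_past)) != 0:
        mtmd_cpp.mtmd_input_chunks_free(chunks)
        raise ValueError("Failed to evaluate chunks")

    mtmd_cpp.mtmd_input_chunks_free(chunks)
    bitmaps.clear()
    return n_past.value

The design point worth noting: mtmd_helper_eval_chunks handles batching and positions internally and writes the final n_past through the out-pointer, which is why the caller in eval_image only has to mark the consumed positions in input_ids afterwards.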

tests/monalisa.jpg

529 KB

tests/test_llava.py

Lines changed: 80 additions & 0 deletions
@@ -0,0 +1,80 @@
+import multiprocessing
+import ctypes
+
+from huggingface_hub import hf_hub_download
+
+import pytest
+
+import llama_cpp
+
+@pytest.fixture
+def mmproj_model_path():
+    repo_id = "second-state/Llava-v1.5-7B-GGUF"
+    filename = "llava-v1.5-7b-mmproj-model-f16.gguf"
+    model_path = hf_hub_download(repo_id, filename)
+    return model_path
+
+@pytest.fixture
+def llava_cpp_model_path():
+    repo_id = "second-state/Llava-v1.5-7B-GGUF"
+    filename = "llava-v1.5-7b-Q8_0.gguf"
+    model_path = hf_hub_download(repo_id, filename)
+    return model_path
+
+def test_real_llava(llava_cpp_model_path, mmproj_model_path):
+    print("initializing model")
+    model = llama_cpp.Llama(
+        llava_cpp_model_path,
+        n_ctx=2048,
+        n_batch=512,
+        n_threads=multiprocessing.cpu_count(),
+        n_threads_batch=multiprocessing.cpu_count(),
+        logits_all=False,
+        verbose=False,
+    )
+
+    # Initialize the LLaVA chat handler
+    from llama_cpp.llama_chat_format import Llava15ChatHandler
+    print("initializing chat handler")
+    chat_handler = Llava15ChatHandler(clip_model_path=mmproj_model_path, llama_model=model)
+
+    # Create a chat message with the image
+    print("creating chat message")
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "image_url",
+                    "image_url": "./tests/monalisa.jpg"
+                },
+                {
+                    "type": "text",
+                    "text": "Do you know who drew this painting?"
+                }
+            ]
+        }
+    ]
+
+    # Generate response
+    print("generating response")
+    response = chat_handler(
+        llama=model,
+        messages=messages,
+        max_tokens=200,
+        temperature=0.2,
+        top_p=0.95,
+        stream=False
+    )
+
+    print("response", response)
+    # Check that we got a response
+    assert response is not None
+    assert "choices" in response
+    assert len(response["choices"]) > 0
+    assert "message" in response["choices"][0]
+    assert "content" in response["choices"][0]["message"]
+
+    # The response should mention Leonardo da Vinci
+    content = response["choices"][0]["message"]["content"].lower()
+    assert "leonardo" in content and "vinci" in content  # Artist name should be in response
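Both fixtures pull their GGUF files from Hugging Face on first run, so the test assumes network access and enough memory for the Q8_0 weights; invoking it with something like pytest -s tests/test_llava.py should exercise the new mtmd image-eval path end to end.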
