migrate clip to mtmd

okaris · okaris · commit 2d0808bd6cfd · 2025-09-06T07:13:02.000Z
diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
@@ -478,6 +478,10 @@ def free_lora_adapter():
                     f"Using fallback chat format: {self.chat_format}", file=sys.stderr
                 )
 
+        if self.chat_handler is not None:
+            if isinstance(self.chat_handler, llama_chat_format.Llava15ChatHandler):
+                self.chat_handler.initialize_mtmd_context(self)
+                
         self._sampler = None
 
     @property
diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py
@@ -2722,9 +2722,7 @@ class Llava15ChatHandler:
         "{% endif %}"
     )
 
-    def __init__(self, clip_model_path: str, llama_model: llama.Llama, verbose: bool = True):
-        import llama_cpp.mtmd_cpp as mtmd_cpp
-
+    def __init__(self, clip_model_path: str, llama_model: Optional[llama.Llama] = None, verbose: bool = True):
         self.clip_model_path = clip_model_path
         self.verbose = verbose
         self._mtmd_cpp = mtmd_cpp
@@ -2769,6 +2767,15 @@ def mtmd_free():
 
             self._exit_stack.callback(mtmd_free)
 
+    def __call__(self, *args, **kwargs):
+        if self.clip_ctx is None:
+            # Initialize MTMD context with the llama model from the first argument
+            if len(args) > 0 and isinstance(args[0], llama.Llama):
+                self.initialize_mtmd_context(args[0])
+            else:
+                raise ValueError("MTMD context not initialized. Please call initialize_mtmd_context with a llama model first.")
+        return super().__call__(*args, **kwargs)
+
     def load_image(self, image_url: str) -> bytes:
         return self._load_image(image_url)
 
@@ -3638,61 +3645,6 @@ def split_text_on_image_urls(text: str, image_urls: List[str]):
                 remaining = ""
         return split_text
 
-    def eval_image(self, llama: llama.Llama, image_url: str):
-        image_bytes = self.load_image(image_url)
-        
-        # Create bitmap manager if not exists
-        if self._bitmap_manager is None:
-            self._bitmap_manager = self._mtmd_cpp.BitmapManager()
-
-        # Create bitmap from bytes
-        if not self._bitmap_manager.add_from_memory(self.clip_ctx, image_bytes):
-            raise ValueError("Failed to create bitmap from image bytes")
-
-        # Create input chunks for the bitmap
-        chunks = self._mtmd_cpp.mtmd_input_chunks_init()
-        if chunks is None:
-            raise ValueError("Failed to create input chunks")
-
-        # Create input text with media marker
-        # Get media marker from context params
-        params = self._mtmd_cpp.mtmd_context_params_default()
-        text = self._mtmd_cpp.mtmd_input_text()
-        text.text = params.media_marker if params.media_marker else self._mtmd_cpp.mtmd_default_marker()
-        text.add_special = False
-        text.parse_special = True
-
-        # Tokenize with bitmap
-        if self._mtmd_cpp.mtmd_tokenize(self.clip_ctx, chunks, text, self._bitmap_manager.c_ptr(), len(self._bitmap_manager.entries)) != 0:
-            self._mtmd_cpp.mtmd_input_chunks_free(chunks)
-            raise ValueError("Failed to tokenize image")
-
-        # Get new n_past after evaluation
-        n_past = ctypes.c_int(llama.n_tokens)
-        n_past_p = ctypes.pointer(n_past)
-
-        # Evaluate chunks
-        if self._mtmd_cpp.mtmd_helper_eval_chunks(
-            self.clip_ctx,
-            llama.ctx,
-            chunks,
-            llama.n_tokens,
-            0,  # seq_id
-            llama.n_batch,
-            True,  # logits_last
-            n_past_p
-        ) != 0:
-            self._mtmd_cpp.mtmd_input_chunks_free(chunks)
-            raise ValueError("Failed to evaluate chunks")
-
-        # Update n_tokens
-        llama.input_ids[llama.n_tokens : n_past.value] = -1
-        llama.n_tokens = n_past.value
-
-        # Cleanup
-        self._mtmd_cpp.mtmd_input_chunks_free(chunks)
-        self._bitmap_manager.clear()
-
 
 def _accumulate_chunks(
     chunks_iterator: Iterator[llama_types.CreateCompletionStreamResponse],

Original file line number	Diff line number	Diff line change
`@@ -478,6 +478,10 @@ def free_lora_adapter():`
`478`	`478`	`f"Using fallback chat format: {self.chat_format}", file=sys.stderr`
`479`	`479`	`)`
`480`	`480`
	`481`	`+ if self.chat_handler is not None:`
	`482`	`+ if isinstance(self.chat_handler, llama_chat_format.Llava15ChatHandler):`
	`483`	`+ self.chat_handler.initialize_mtmd_context(self)`
	`484`	`+`
`481`	`485`	`self._sampler = None`
`482`	`486`
`483`	`487`	`@property`