Commit 11f768e: migrate llava to mtmd

1 parent 7c44196
File tree: 3 files changed (+16, -259 lines)

CMakeLists.txt

Lines changed: 3 additions & 3 deletions

@@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 3.21)
 project(llama_cpp)
 
 option(LLAMA_BUILD "Build llama.cpp shared library and install alongside python package" ON)
-option(LLAVA_BUILD "Build llava shared library and install alongside python package" ON)
+option(MTMD_BUILD "Build multimodal (mtmd) shared library and install alongside python package" ON)
 
 function(llama_cpp_python_install_target target)
     if(NOT TARGET ${target})
@@ -143,7 +143,7 @@ if (LLAMA_BUILD)
         )
     endif()
 
-    if (LLAVA_BUILD)
+    if (MTMD_BUILD)
         if (LLAMA_CUBLAS OR LLAMA_CUDA)
             add_compile_definitions(GGML_USE_CUBLAS)
             add_compile_definitions(GGML_USE_CUDA)
@@ -153,7 +153,7 @@ if (LLAMA_BUILD)
             add_compile_definitions(GGML_USE_METAL)
         endif()
 
-        # Building llava
+        # Building multimodal support using mtmd
         add_subdirectory(vendor/llama.cpp/tools/mtmd)
 
         if (WIN32)
llama_cpp/llama_chat_format.py

Lines changed: 13 additions & 13 deletions
@@ -3063,7 +3063,7 @@ def eval_image(self, llama: llama.Llama, image_url: str):
         n_past = ctypes.c_int(llama.n_tokens)
         n_past_p = ctypes.pointer(n_past)
         with suppress_stdout_stderr(disable=self.verbose):
-            self._llava_cpp.llava_eval_image_embed(
+            self._mtmd_cpp.mtmd_cpp_eval_image_embed(
                 llama.ctx,
                 embed,
                 llama.n_batch,
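The new call assumes a mtmd_cpp_eval_image_embed symbol on the loaded bindings. A sketch of what the ctypes declaration for it might look like; the library name and the exact C signature are assumptions, mirrored from the shape of the old llava_eval_image_embed call:

```python
import ctypes

# Hypothetical binding declaration; the real one lives in the mtmd
# bindings module, which this commit does not show. Assumed signature:
# (llama_context *, image embed *, n_batch, int *n_past) -> bool
lib = ctypes.CDLL("libmtmd.so")  # assumed shared library name
lib.mtmd_cpp_eval_image_embed.argtypes = [
    ctypes.c_void_p,               # llama_context *
    ctypes.c_void_p,               # image embed *
    ctypes.c_int,                  # n_batch
    ctypes.POINTER(ctypes.c_int),  # n_past (in/out)
]
lib.mtmd_cpp_eval_image_embed.restype = ctypes.c_bool
```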
@@ -3648,30 +3648,30 @@ def eval_image(self, llama: llama.Llama, image_url: str):
         )
 
         img_bytes = self.load_image(image_url)
-        img_u8_p = self._llava_cpp.clip_image_u8_init()
-        if not self._llava_cpp.clip_image_load_from_bytes(
+        img_u8_p = self._mtmd_cpp.clip_image_u8_init()
+        if not self._mtmd_cpp.clip_image_load_from_bytes(
             ctypes.create_string_buffer(img_bytes, len(img_bytes)),
             ctypes.c_size_t(len(img_bytes)),
             img_u8_p,
         ):
-            self._llava_cpp.clip_image_u8_free(img_u8_p)
+            self._mtmd_cpp.clip_image_u8_free(img_u8_p)
             raise ValueError("Failed to load image.")
 
-        img_f32_p = self._llava_cpp.clip_image_f32_batch_init()
-        if not self._llava_cpp.clip_image_preprocess(self.clip_ctx, img_u8_p, img_f32_p):
-            self._llava_cpp.clip_image_f32_batch_free(img_f32_p)
-            self._llava_cpp.clip_image_u8_free(img_u8_p)
+        img_f32_p = self._mtmd_cpp.clip_image_f32_batch_init()
+        if not self._mtmd_cpp.clip_image_preprocess(self.clip_ctx, img_u8_p, img_f32_p):
+            self._mtmd_cpp.clip_image_f32_batch_free(img_f32_p)
+            self._mtmd_cpp.clip_image_u8_free(img_u8_p)
             raise ValueError("Failed to preprocess image.")
 
         n_embd = llama_cpp.llama_model_n_embd(llama._model.model)
         embed = (ctypes.c_float * (n_tokens * n_embd))()
-        if not self._llava_cpp.clip_image_batch_encode(self.clip_ctx, llama.n_threads, img_f32_p, embed):
-            self._llava_cpp.clip_image_f32_batch_free(img_f32_p)
-            self._llava_cpp.clip_image_u8_free(img_u8_p)
+        if not self._mtmd_cpp.clip_image_batch_encode(self.clip_ctx, llama.n_threads, img_f32_p, embed):
+            self._mtmd_cpp.clip_image_f32_batch_free(img_f32_p)
+            self._mtmd_cpp.clip_image_u8_free(img_u8_p)
             raise ValueError("Failed to encode image.")
 
-        self._llava_cpp.clip_image_f32_batch_free(img_f32_p)
-        self._llava_cpp.clip_image_u8_free(img_u8_p)
+        self._mtmd_cpp.clip_image_f32_batch_free(img_f32_p)
+        self._mtmd_cpp.clip_image_u8_free(img_u8_p)
         llama_cpp.llama_set_causal_attn(llama.ctx, False)
 
         seq_id_0 = (ctypes.c_int32 * 1)()
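The renamed bindings keep the same manual memory discipline: every failure branch frees the u8 image and the f32 batch by hand. For reference, a sketch of the same call sequence with the frees centralized in a try/finally; mtmd_cpp and clip_ctx here stand in for the self._mtmd_cpp and self.clip_ctx attributes used above:

```python
import ctypes


def encode_image_embed(mtmd_cpp, clip_ctx, img_bytes, n_threads, n_tokens, n_embd):
    """Sketch of the clip encode sequence above with centralized cleanup."""
    img_u8_p = mtmd_cpp.clip_image_u8_init()
    img_f32_p = None
    try:
        if not mtmd_cpp.clip_image_load_from_bytes(
            ctypes.create_string_buffer(img_bytes, len(img_bytes)),
            ctypes.c_size_t(len(img_bytes)),
            img_u8_p,
        ):
            raise ValueError("Failed to load image.")

        img_f32_p = mtmd_cpp.clip_image_f32_batch_init()
        if not mtmd_cpp.clip_image_preprocess(clip_ctx, img_u8_p, img_f32_p):
            raise ValueError("Failed to preprocess image.")

        embed = (ctypes.c_float * (n_tokens * n_embd))()
        if not mtmd_cpp.clip_image_batch_encode(clip_ctx, n_threads, img_f32_p, embed):
            raise ValueError("Failed to encode image.")
        return embed
    finally:
        # Frees run exactly once, on success and on every error path alike.
        if img_f32_p is not None:
            mtmd_cpp.clip_image_f32_batch_free(img_f32_p)
        mtmd_cpp.clip_image_u8_free(img_u8_p)
```

This is purely illustrative; the committed code keeps the explicit per-branch frees, which matches the surrounding style, while the finally form makes the free-exactly-once invariant harder to break when branches are added.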

llama_cpp/llava_cpp.py

Lines changed: 0 additions & 243 deletions
This file was deleted.
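Deleting llama_cpp/llava_cpp.py will break any downstream code that imported it directly. A hedged compatibility shim, assuming the replacement bindings are importable as llama_cpp.mtmd_cpp (this commit only references the _mtmd_cpp attribute, so that module name is an assumption):

```python
# Hypothetical downstream guard across the migration; the module names
# are assumptions, not confirmed by this commit.
try:
    from llama_cpp import mtmd_cpp as vision_cpp  # post-migration builds
except ImportError:
    from llama_cpp import llava_cpp as vision_cpp  # pre-migration builds
```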
