
Commit 6f8ec8b

Sync llama/mtmd API change, support clip flash-attn
1 parent ab70ead commit 6f8ec8b

File tree

3 files changed: +48 -7 lines changed


llama_cpp/llama_chat_format.py

Lines changed: 7 additions & 6 deletions

@@ -2811,17 +2811,18 @@ def _init_mtmd_context(self, llama_model: llama.Llama):
 
         with suppress_stdout_stderr(disable=self.verbose):
             # Get default parameters
-            ctx_params = self._mtmd_cpp.mtmd_context_params_default()
-            ctx_params.use_gpu = True  # TODO: Make this configurable
-            ctx_params.print_timings = self.verbose
-            ctx_params.n_threads = llama_model.n_threads
-            ctx_params.verbosity = 2 if self.verbose else 0  # GGML_LOG_LEVEL_INFO = 2
+            mctx_params = self._mtmd_cpp.mtmd_context_params_default()
+            mctx_params.use_gpu = True  # TODO: Make this configurable
+            mctx_params.print_timings = self.verbose
+            mctx_params.n_threads = llama_model.n_threads
+            mctx_params.verbosity = 2 if self.verbose else 0  # GGML_LOG_LEVEL_INFO = 2
+            mctx_params.flash_attn_type = self._mtmd_cpp.clip_flash_attn_type.CLIP_FLASH_ATTN_TYPE_AUTO
 
             # Initialize mtmd context
             self.mtmd_ctx = self._mtmd_cpp.mtmd_init_from_file(
                 self.clip_model_path.encode(),
                 llama_model.model,
-                ctx_params
+                mctx_params
             )
 
             if self.mtmd_ctx is None:
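The rename from ctx_params to mctx_params keeps the mtmd parameters distinct from the llama context parameters, and flash attention is left on CLIP_FLASH_ATTN_TYPE_AUTO, matching the existing use_gpu TODO. A sketch of how the tri-state enum could be surfaced as an option; the flash_attn parameter below is hypothetical, not part of this commit:

# Hypothetical sketch: the commit hard-codes AUTO; the `flash_attn`
# keyword below is an assumed extension, shown for illustration only.
def _init_mtmd_context(self, llama_model, flash_attn: bool | None = None):
    with suppress_stdout_stderr(disable=self.verbose):
        mctx_params = self._mtmd_cpp.mtmd_context_params_default()
        mctx_params.use_gpu = True
        mctx_params.print_timings = self.verbose
        mctx_params.n_threads = llama_model.n_threads
        mctx_params.verbosity = 2 if self.verbose else 0  # GGML_LOG_LEVEL_INFO
        # None -> AUTO (probe support at load), True -> ENABLED, False -> DISABLED
        fa = self._mtmd_cpp.clip_flash_attn_type
        mctx_params.flash_attn_type = (
            fa.CLIP_FLASH_ATTN_TYPE_AUTO
            if flash_attn is None
            else fa.CLIP_FLASH_ATTN_TYPE_ENABLED
            if flash_attn
            else fa.CLIP_FLASH_ATTN_TYPE_DISABLED
        )
        self.mtmd_ctx = self._mtmd_cpp.mtmd_init_from_file(
            self.clip_model_path.encode(), llama_model.model, mctx_params
        )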

llama_cpp/llama_cpp.py

Lines changed: 7 additions & 1 deletion

@@ -1401,12 +1401,18 @@ def llama_supports_gpu_offload() -> bool:
 def llama_supports_rpc() -> bool:
     ...
 
-
+# // NOTE: After creating a llama_context, it is recommended to query the actual values using these functions
+# // In some cases the requested values via llama_context_params may differ from the actual values used by the context
 # LLAMA_API uint32_t llama_n_ctx (const struct llama_context * ctx);
 @ctypes_function("llama_n_ctx", [llama_context_p_ctypes], ctypes.c_uint32)
 def llama_n_ctx(ctx: llama_context_p, /) -> int:
     ...
 
+# LLAMA_API uint32_t llama_n_ctx_seq (const struct llama_context * ctx);
+@ctypes_function("llama_n_ctx_seq", [llama_context_p_ctypes], ctypes.c_uint32)
+def llama_n_ctx_seq(ctx: llama_context_p, /) -> int:
+    ...
+
 
 # LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx);
 @ctypes_function("llama_n_batch", [llama_context_p_ctypes], ctypes.c_uint32)
llama_cpp/mtmd_cpp.py

Lines changed: 34 additions & 0 deletions

@@ -108,13 +108,44 @@ class mtmd_input_text(Structure):
 mtmd_input_text_p = NewType("mtmd_input_text_p", int)
 mtmd_input_text_p_ctypes = POINTER(mtmd_input_text)
 
+# enum clip_flash_attn_type {
+#     CLIP_FLASH_ATTN_TYPE_AUTO = -1,
+#     CLIP_FLASH_ATTN_TYPE_DISABLED = 0,
+#     CLIP_FLASH_ATTN_TYPE_ENABLED = 1,
+# };
+class clip_flash_attn_type(enum.IntEnum):
+    CLIP_FLASH_ATTN_TYPE_AUTO = -1
+    CLIP_FLASH_ATTN_TYPE_DISABLED = 0
+    CLIP_FLASH_ATTN_TYPE_ENABLED = 1
+
+# struct clip_context_params {
+#     bool use_gpu;
+#     enum ggml_log_level verbosity;
+#     enum clip_flash_attn_type flash_attn_type;
+#     int image_min_tokens;
+#     int image_max_tokens;
+# };
+class clip_context_params(Structure):
+    _fields_ = [
+        ("use_gpu", c_bool),
+        ("verbosity", c_int),
+        ("flash_attn_type", c_int),
+        ("image_min_tokens", c_int),
+        ("image_max_tokens", c_int),
+    ]
+
 # struct mtmd_context_params {
 #     bool use_gpu;
 #     bool print_timings;
 #     int n_threads;
 #     enum ggml_log_level verbosity;
 #     const char * image_marker; // deprecated, use media_marker instead
 #     const char * media_marker;
+#     enum llama_flash_attn_type flash_attn_type;
+
+#     // limit number of image tokens, only for vision models with dynamic resolution
+#     int image_min_tokens; // minimum number of tokens for image input (default: read from metadata)
+#     int image_max_tokens; // maximum number of tokens for image input (default: read from metadata)
 # };
 class mtmd_context_params(Structure):
     _fields_ = [
@@ -124,6 +155,9 @@ class mtmd_context_params(Structure):
         ("verbosity", c_int),
         ("image_marker", c_char_p),
         ("media_marker", c_char_p),
+        ("flash_attn_type", c_int),
+        ("image_min_tokens", c_int),
+        ("image_max_tokens", c_int),
     ]
 
 mtmd_context_params_p = NewType("mtmd_context_params_p", int)
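Since the new members are plain ctypes fields, they can be set directly on the defaults struct; the IntEnum coerces to c_int on assignment. A small sketch, where the thread count and token cap are illustrative values rather than defaults:

# Sketch: populate the extended mtmd_context_params from Python.
from llama_cpp import mtmd_cpp

params = mtmd_cpp.mtmd_context_params_default()
params.use_gpu = True
params.n_threads = 4  # illustrative value
params.flash_attn_type = mtmd_cpp.clip_flash_attn_type.CLIP_FLASH_ATTN_TYPE_AUTO
# Only honored by vision models with dynamic resolution; defaults come from
# model metadata, so override only when a hard cap is needed.
params.image_max_tokens = 1024  # illustrative value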
