@@ -108,13 +108,44 @@ class mtmd_input_text(Structure):
# Two parallel names for "pointer to mtmd_input_text":
#   * mtmd_input_text_p        - a typing.NewType over int, for annotations only
#                                (presumably the pointer is treated as an opaque
#                                integer handle by type checkers; confirm at call sites).
#   * mtmd_input_text_p_ctypes - the ctypes pointer type built from the Structure.
108108mtmd_input_text_p = NewType ("mtmd_input_text_p" , int )
109109mtmd_input_text_p_ctypes = POINTER (mtmd_input_text )
110110
111+ # enum clip_flash_attn_type {
112+ # CLIP_FLASH_ATTN_TYPE_AUTO = -1,
113+ # CLIP_FLASH_ATTN_TYPE_DISABLED = 0,
114+ # CLIP_FLASH_ATTN_TYPE_ENABLED = 1,
115+ # };
class clip_flash_attn_type(enum.IntEnum):
    """Python mirror of the C ``enum clip_flash_attn_type``.

    Members are plain integers (``IntEnum``), so they compare equal to the
    raw values of the C enumerators quoted in the comment above this class.
    """

    # Negative value, distinct from the two explicit on/off settings below.
    CLIP_FLASH_ATTN_TYPE_AUTO = -1
    # Explicit "off" value.
    CLIP_FLASH_ATTN_TYPE_DISABLED = 0
    # Explicit "on" value.
    CLIP_FLASH_ATTN_TYPE_ENABLED = 1
120+
121+ # struct clip_context_params {
122+ # bool use_gpu;
123+ # enum ggml_log_level verbosity;
124+ # enum clip_flash_attn_type flash_attn_type;
125+ # int image_min_tokens;
126+ # int image_max_tokens;
127+ # };
class clip_context_params(Structure):
    """ctypes layout for the C ``struct clip_context_params``.

    Field order and types follow the C declaration quoted in the comment
    above this class. Both enum-typed members are declared as ``c_int``.
    NOTE(review): this assumes the C side stores these enums as ``int``;
    confirm against the compiled library's ABI.
    """

    _fields_ = [
        ("use_gpu", c_bool),          # bool use_gpu
        ("verbosity", c_int),         # enum ggml_log_level verbosity
        ("flash_attn_type", c_int),   # enum clip_flash_attn_type flash_attn_type
        ("image_min_tokens", c_int),  # int image_min_tokens
        ("image_max_tokens", c_int),  # int image_max_tokens
    ]
136+
111137# struct mtmd_context_params {
112138# bool use_gpu;
113139# bool print_timings;
114140# int n_threads;
115141# enum ggml_log_level verbosity;
116142# const char * image_marker; // deprecated, use media_marker instead
117143# const char * media_marker;
144+ # enum llama_flash_attn_type flash_attn_type;
145+
146+ # // limit number of image tokens, only for vision models with dynamic resolution
147+ # int image_min_tokens; // minimum number of tokens for image input (default: read from metadata)
148+ # int image_max_tokens; // maximum number of tokens for image input (default: read from metadata)
118149# };
119150class mtmd_context_params (Structure ):
120151 _fields_ = [
@@ -124,6 +155,9 @@ class mtmd_context_params(Structure):
124155 ("verbosity" , c_int ),
125156 ("image_marker" , c_char_p ),
126157 ("media_marker" , c_char_p ),
158+ ("flash_attn_type" , c_int ),
159+ ("image_min_tokens" , c_int ),
160+ ("image_max_tokens" , c_int ),
127161 ]
128162
129163mtmd_context_params_p = NewType ("mtmd_context_params_p" , int )
0 commit comments