@@ -106,10 +106,10 @@ static uint64_t hash_vector_float(const std::vector<float> & vec) {
     return seed;
 }
 
-mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx,
-                                  const mtmd_input_text & text,
-                                  const std::vector<mtmd_bitmap> & bitmaps) {
-    mtmd_input_chunks * output = new mtmd_input_chunks;
+int32_t mtmd_tokenize(mtmd_context * ctx,
+                      std::vector<mtmd_input_chunk> & output,
+                      const mtmd_input_text & text,
+                      const std::vector<mtmd_bitmap> & bitmaps) {
     auto vocab = llama_model_get_vocab(ctx->text_model);
 
     std::string prompt_modified(text.text);
@@ -124,8 +124,8 @@ mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx,
     }
 
     std::vector<std::string> parts = string_split_str(text.text, ctx->image_marker);
-    output->clear();
-    output->reserve(parts.size());
+    output.clear();
+    output.reserve(parts.size());
 
     size_t i_img = 0;
 
@@ -141,14 +141,14 @@ mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx,
             std::move(tokens),
             {},
         };
-        output->emplace_back(std::move(chunk));
+        output.emplace_back(std::move(chunk));
 
         if (&parts.back() != &part) {
             // add image token to middle of 2 parts
 
             if (i_img >= bitmaps.size()) {
                 LOG_ERR("%s: error: not enough images for %d parts\n", __func__, (int)parts.size());
-                return nullptr;
+                return 1;
             }
 
             // shim layer
@@ -163,10 +163,10 @@ mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx,
             bool ok = clip_image_preprocess(ctx->ctx_clip, img_u8.get(), &batch_f32);
             if (!ok) {
                 LOG_ERR("Unable to preprocess image\n");
-                return nullptr;
+                return 2;
             }
 
-            mtmd_image_tokens * image_tokens = new mtmd_image_tokens;
+            mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
             image_tokens->nx = clip_n_patches(ctx->ctx_clip); // TODO @ngxson : use clip_n_patches_by_image
             image_tokens->ny = 1; // TODO
             image_tokens->batch_f32 = std::move(batch_f32);
@@ -179,14 +179,14 @@ mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx,
             mtmd_input_chunk chunk{
                 MTMD_INPUT_CHUNK_TYPE_IMAGE,
                 {},
-                image_tokens,
+                std::move(image_tokens),
             };
-            output->emplace_back(std::move(chunk));
+            output.emplace_back(std::move(chunk));
             i_img++;
         }
     }
 
-    return output;
+    return 0;
 }
 
 void mtmd_image_tokens_free(mtmd_image_tokens * image_tokens) {
@@ -195,18 +195,6 @@ void mtmd_image_tokens_free(mtmd_image_tokens * image_tokens) {
     }
 }
 
-void mtmd_input_chunks_free(mtmd_input_chunks * chunks, bool free_images) {
-    if (free_images) {
-        for (auto & chunk : *chunks) {
-            if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE && chunk.tokens_image) {
-                mtmd_image_tokens_free(chunk.tokens_image);
-                chunk.tokens_image = nullptr;
-            }
-        }
-    }
-    delete chunks;
-}
-
 size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens) {
     return image_tokens->n_tokens();
 }
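
With this change, mtmd_tokenize reports status through its int32_t return value and fills a caller-owned vector, instead of returning a heap-allocated mtmd_input_chunks * that had to be released with the now-deleted mtmd_input_chunks_free. A minimal caller-side sketch of the new contract (ctx, text and bitmaps are assumed to be set up elsewhere):

// caller-side sketch: the vector owns the chunks, so there is no manual free
std::vector<mtmd_input_chunk> chunks;
int32_t res = mtmd_tokenize(ctx, chunks, text, bitmaps);
if (res != 0) {
    // per this diff: 1 = not enough images for the markers,
    // 2 = image preprocessing failed
    LOG_ERR("mtmd_tokenize failed with status %d\n", res);
}
// the chunks (and the image tokens they hold via mtmd_image_tokens_ptr)
// are released automatically when the vector goes out of scope
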
@@ -238,9 +226,9 @@ float * mtmd_get_output_embd(mtmd_context * ctx) {
     return ctx->image_embd_v.data();
 }
 
-size_t mtmd_helper_get_n_tokens(mtmd_input_chunks * chunks) {
+size_t mtmd_helper_get_n_tokens(mtmd_input_chunks & chunks) {
     size_t n_tokens = 0;
-    for (auto & chunk : *chunks) {
+    for (auto & chunk : chunks) {
         if (chunk.type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
             n_tokens += chunk.tokens_text.size();
         } else if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
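
Both helpers now take the chunk list by reference rather than by pointer, matching the vector that mtmd_tokenize fills. A short usage sketch (assuming mtmd_input_chunks is an alias for std::vector<mtmd_input_chunk>, which the two signatures above suggest):

mtmd_input_chunks chunks;
// ... populate via mtmd_tokenize(ctx, chunks, text, bitmaps) ...
size_t total = mtmd_helper_get_n_tokens(chunks); // text + image tokens combined
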
@@ -289,16 +277,16 @@ struct decode_embd_batch {
 
 int32_t mtmd_helper_eval(mtmd_context * ctx,
                          llama_context * lctx,
-                         mtmd_input_chunks * chunks,
+                         mtmd_input_chunks & chunks,
                          llama_pos pos0,
                          llama_seq_id seq_id,
                          int32_t n_batch) {
     int32_t ret;
     llama_pos n_past = pos0;
     llama_batch text_batch = llama_batch_init(n_batch, 0, 1);
 
-    for (auto & chunk : *chunks) {
-        bool is_last = &chunk == &chunks->back();
+    for (auto & chunk : chunks) {
+        bool is_last = &chunk == &chunks.back();
         if (chunk.type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
             // TODO @ngxson : may need to split into smaller batches
             text_batch.n_tokens = chunk.tokens_text.size();
@@ -327,7 +315,7 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,
             if (ctx->print_timings) {
                 LOG_INF("encoding image...\n");
             }
-            ret = mtmd_encode(ctx, chunk.tokens_image);
+            ret = mtmd_encode(ctx, chunk.tokens_image.get());
             if (ret != 0) {
                 LOG_ERR("failed to encode image\n");
                 llama_batch_free(text_batch);
@@ -337,7 +325,7 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,
                 LOG_INF("image encoded in %" PRId64 " ms\n", ggml_time_ms() - t0);
             }
 
-            int32_t n_tokens = mtmd_image_tokens_get_n_tokens(chunk.tokens_image);
+            int32_t n_tokens = mtmd_image_tokens_get_n_tokens(chunk.tokens_image.get());
             float * embd = mtmd_get_output_embd(ctx);
             decode_embd_batch batch_img(embd, n_tokens, n_past, 0);
             int64_t t1 = ggml_time_ms();
@@ -395,3 +383,7 @@ bool mtmd_decode_use_non_causal(mtmd_context * ctx) {
     }
     return false;
 }
+
+void mtmd_image_tokens_deleter::operator()(mtmd_image_tokens * val) {
+    mtmd_image_tokens_free(val);
+}
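
The deleter defined above routes destruction through mtmd_image_tokens_free, which is what lets chunks hold their image tokens in a smart pointer (mtmd_image_tokens_ptr) and makes the std::move calls in mtmd_tokenize necessary. A sketch of the header-side declarations this implies; the exact wording in mtmd.h is an assumption, since the header is not part of this diff:

// assumed mtmd.h counterpart of the definition above: a unique_ptr alias
// whose deleter calls mtmd_image_tokens_free(), so the image tokens owned
// by a chunk are released when the chunk is destroyed (requires <memory>)
struct mtmd_image_tokens_deleter {
    void operator()(mtmd_image_tokens * val);
};
using mtmd_image_tokens_ptr = std::unique_ptr<mtmd_image_tokens, mtmd_image_tokens_deleter>;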