@@ -16,15 +16,22 @@ struct mtmd_context {
1616 struct clip_ctx * ctx_clip;
1717 const struct llama_model * text_model;
1818 std::vector<float > image_embd_v; // image embedding vector
19+
1920 bool print_timings;
2021 int n_threads;
2122 std::string image_marker;
23+ bool calc_image_hash;
2224
2325 // TODO @ngxson : add timings
2426
2527 mtmd_context (const char * mmproj_fname,
2628 const llama_model * text_model,
27- const mtmd_context_params & ctx_params) : print_timings(ctx_params.print_timings), n_threads(ctx_params.n_threads), image_marker(ctx_params.image_marker) {
29+ const mtmd_context_params & ctx_params) :
30+ print_timings (ctx_params.print_timings),
31+ n_threads (ctx_params.n_threads),
32+ image_marker (ctx_params.image_marker),
33+ calc_image_hash (ctx_params.calc_image_hash)
34+ {
2835 clip_context_params ctx_clip_params;
2936 ctx_clip_params.use_gpu = ctx_params.use_gpu ;
3037 ctx_clip_params.verbosity = ctx_params.verbosity ;
@@ -49,6 +56,7 @@ struct mtmd_image_tokens {
4956 uint32_t ny; // number of tokens in y direction
5057 uint32_t n_tokens () const { return nx * ny; }
5158 clip_image_f32_batch batch_f32; // preprocessed image patches
59+ size_t image_hash = 0 ; // hash of the image, useful for KV cache tracking
5260};
5361
5462mtmd_context * mtmd_init_from_file (const char * mmproj_fname,
@@ -88,6 +96,16 @@ static std::vector<llama_token> mtmd_tokenize_text_internal(
8896 return result;
8997}
9098
// Reduce a vector of floats to a single 64-bit hash value.
// The accumulator is seeded with the element count, then every element is
// hashed with std::hash<float> and folded in, in order.
// An empty vector therefore hashes to 0.
static uint64_t hash_vector_float(const std::vector<float> & vec) {
    std::hash<float> hash_one;
    const size_t n_elems = vec.size();

    uint64_t acc = n_elems;
    for (size_t i = 0; i < n_elems; ++i) {
        // mixing step modelled on boost::hash_combine
        acc ^= hash_one(vec[i]) + 0x9e3779b9 + (acc << 6) + (acc >> 2);
    }
    return acc;
}
108+
91109mtmd_input_chunks * mtmd_tokenize (mtmd_context * ctx,
92110 const mtmd_input_text & text,
93111 const std::vector<mtmd_bitmap> & bitmaps) {
@@ -153,6 +171,11 @@ mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx,
153171 image_tokens->ny = 1 ; // TODO
154172 image_tokens->batch_f32 = std::move (batch_f32);
155173
174+ // optionally calculate the hash
175+ if (ctx->calc_image_hash ) {
176+ image_tokens->image_hash = hash_vector_float (image_tokens->batch_f32 .entries [0 ]->buf );
177+ }
178+
156179 mtmd_input_chunk chunk{
157180 MTMD_INPUT_CHUNK_TYPE_IMAGE,
158181 {},
@@ -196,6 +219,10 @@ size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens) {
196219 return image_tokens->ny ;
197220}
198221
222+ uint64_t mtmd_image_tokens_get_hash (const mtmd_image_tokens * image_tokens) {
223+ return image_tokens->image_hash ;
224+ }
225+
199226int32_t mtmd_encode (mtmd_context * ctx, const mtmd_image_tokens * image_tokens) {
200227 int n_mmproj_embd = clip_n_mmproj_embd (ctx->ctx_clip );
201228 ctx->image_embd_v .resize (image_tokens->n_tokens () * n_mmproj_embd);
0 commit comments