@@ -201,6 +201,7 @@ struct clip_hparams {
201201 // legacy
202202 bool has_llava_projector = false ;
203203 int minicpmv_version = 0 ;
204+ int minicpmv_max_slice_nums = 9 ;
204205 int32_t minicpmv_query_num = 0 ; // MiniCPM-V query number
205206};
206207
@@ -3260,16 +3261,67 @@ struct llava_uhd {
32603261 const bool has_slices = original_size.width > slice_size || original_size.height > slice_size;
32613262 const bool has_pinpoints = !ctx->model .hparams .image_res_candidates .empty ();
32623263
3263- if (!has_slices) {
3264- // skip slicing logic
3265- res.overview_size = clip_image_size{slice_size, slice_size};
3266- res.refined_size = clip_image_size{0 , 0 };
3267- res.grid_size = clip_image_size{0 , 0 };
3264+ if (clip_is_minicpmv (ctx)) {
3265+ auto best_size = get_best_resize (original_size, slice_size, patch_size, !has_slices);
3266+ res.overview_size = best_size;
3267+
3268+ {
3269+ const int max_slice_nums = ctx->model .hparams .minicpmv_max_slice_nums ;
3270+ const float log_ratio = log ((float )original_width / original_height);
3271+ const float ratio = (float )original_width * original_height / (slice_size * slice_size);
3272+ const int multiple = fmin (ceil (ratio), max_slice_nums);
3273+
3274+ auto best_grid = get_best_grid (max_slice_nums, multiple, log_ratio);
3275+ auto refine_size = get_refine_size (original_size, best_grid, slice_size, patch_size, true );
3276+ res.grid_size = best_grid;
3277+ res.refined_size = refine_size;
3278+
3279+ LOG_DBG (" %s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d, grid size: %d x %d\n " ,
3280+ __func__, original_width, original_height,
3281+ res.overview_size .width , res.overview_size .height ,
3282+ res.refined_size .width , res.refined_size .height ,
3283+ res.grid_size .width , res.grid_size .height );
3284+
3285+ if (!has_slices || max_slice_nums == 0 ) {
3286+ return res;
3287+ }
3288+
3289+ int width = refine_size.width ;
3290+ int height = refine_size.height ;
3291+ int grid_x = int (width / best_grid.width );
3292+ int grid_y = int (height / best_grid.height );
3293+ for (int patches_y = 0 , ic = 0 ;
3294+ patches_y < refine_size.height && ic < best_grid.height ;
3295+ patches_y += grid_y, ic += 1 ) {
3296+ for (int patches_x = 0 , jc = 0 ;
3297+ patches_x < refine_size.width && jc < best_grid.width ;
3298+ patches_x += grid_x, jc += 1 ) {
3299+ slice_coordinates slice;
3300+ slice.x = patches_x;
3301+ slice.y = patches_y;
3302+ slice.size .width = grid_x;
3303+ slice.size .height = grid_y;
3304+ res.slices .push_back (slice);
3305+ LOG_DBG (" %s: slice %d: x=%d, y=%d, size=%dx%d\n " ,
3306+ __func__, (int )res.slices .size () - 1 ,
3307+ slice.x , slice.y , slice.size .width , slice.size .height );
3308+ }
3309+ }
3310+ }
32683311
32693312 return res;
32703313 }
3314+ else {
3315+ if (!has_slices) {
3316+ // skip slicing logic
3317+ res.overview_size = clip_image_size{slice_size, slice_size};
3318+ res.refined_size = clip_image_size{0 , 0 };
3319+ res.grid_size = clip_image_size{0 , 0 };
32713320
3272- if (has_pinpoints) {
3321+ return res;
3322+ }
3323+
3324+ if (has_pinpoints) {
32733325 // has pinpoints, use them to calculate the grid size (e.g. llava-1.6)
32743326 auto refine_size = llava_uhd::select_best_resolution (
32753327 original_size,
@@ -3305,53 +3357,7 @@ struct llava_uhd {
33053357
33063358 return res;
33073359 }
3308-
3309- // no pinpoints, dynamically calculate the grid size (e.g. minicpmv)
3310-
3311- auto best_size = get_best_resize (original_size, slice_size, patch_size, !has_slices);
3312- res.overview_size = best_size;
3313-
3314- {
3315- const int max_slice_nums = 9 ; // TODO: this is only used by minicpmv, maybe remove it
3316- const float log_ratio = log ((float )original_width / original_height);
3317- const float ratio = (float )original_width * original_height / (slice_size * slice_size);
3318- const int multiple = fmin (ceil (ratio), max_slice_nums);
3319-
3320- auto best_grid = get_best_grid (max_slice_nums, multiple, log_ratio);
3321- auto refine_size = get_refine_size (original_size, best_grid, slice_size, patch_size, true );
3322- res.grid_size = best_grid;
3323- res.refined_size = refine_size;
3324-
3325- LOG_DBG (" %s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d, grid size: %d x %d\n " ,
3326- __func__, original_width, original_height,
3327- res.overview_size .width , res.overview_size .height ,
3328- res.refined_size .width , res.refined_size .height ,
3329- res.grid_size .width , res.grid_size .height );
3330-
3331- int width = refine_size.width ;
3332- int height = refine_size.height ;
3333- int grid_x = int (width / best_grid.width );
3334- int grid_y = int (height / best_grid.height );
3335- for (int patches_y = 0 , ic = 0 ;
3336- patches_y < refine_size.height && ic < best_grid.height ;
3337- patches_y += grid_y, ic += 1 ) {
3338- for (int patches_x = 0 , jc = 0 ;
3339- patches_x < refine_size.width && jc < best_grid.width ;
3340- patches_x += grid_x, jc += 1 ) {
3341- slice_coordinates slice;
3342- slice.x = patches_x;
3343- slice.y = patches_y;
3344- slice.size .width = grid_x;
3345- slice.size .height = grid_y;
3346- res.slices .push_back (slice);
3347- LOG_DBG (" %s: slice %d: x=%d, y=%d, size=%dx%d\n " ,
3348- __func__, (int )res.slices .size () - 1 ,
3349- slice.x , slice.y , slice.size .width , slice.size .height );
3350- }
3351- }
33523360 }
3353-
3354- return res;
33553361 }
33563362
33573363 static std::vector<clip_image_u8_ptr> slice_image (const clip_image_u8 * img, const slice_instructions & inst) {
@@ -4390,6 +4396,12 @@ bool clip_has_whisper_encoder(const struct clip_ctx * ctx) {
43904396 || ctx->proj_type () == PROJECTOR_TYPE_VOXTRAL;
43914397}
43924398
4399+ void clip_set_minicpmv_max_slice_nums (struct clip_ctx * ctx, int n) {
4400+ if (!ctx) return ;
4401+ if (n < 0 ) n = 0 ;
4402+ ctx->model .hparams .minicpmv_max_slice_nums = n;
4403+ }
4404+
43934405bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
43944406 clip_image_f32 clip_img;
43954407 clip_img.buf .resize (h * w * 3 );
0 commit comments