@@ -228,7 +228,7 @@ std::vector<int64_t> bucket_size_right(const std::vector<float>& fractional_coor
 ov::Tensor prepare_vis_position_ids(
     const ov::Tensor& pixel_values,
     const ov::Tensor& patch_attention_mask,
-    const std::vector<HeightWidth> tgt_sizes,
+    const std::vector<ImageSize> tgt_sizes,
     size_t patch_size,
     size_t num_patches_per_side
 ) {
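
For orientation, `ImageSize`, which this diff substitutes for `HeightWidth`, is presumably a plain height/width pair. A minimal sketch of what it could look like, assuming the rename keeps the old two-field layout:

    struct ImageSize {
        size_t height;  // rows, in pixels or patches depending on context
        size_t width;   // columns, same unit as height
    };

Callers such as `prepare_vis_position_ids` only read `height` and `width`, so any aggregate with those two fields would fit.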
@@ -283,7 +283,7 @@ EncodedImage llava_image_embed_make_with_bytes_slice(clip_ctx& ctx_clip, const o
     };
     std::vector<std::vector<clip_image_u8>> imgs = ::slice_image(source, max_slice_nums, scale_resolution, patch_size, never_split);
     std::vector<std::vector<ov::Tensor>> results;
-    std::vector<std::vector<HeightWidth>> sizes;
+    std::vector<std::vector<ImageSize>> sizes;
 
     // std::vector<clip_image_f32*> img_res_v; // format N x H x W x RGB (N x 336 x 336 x 3), so interleaved RGB - different to the python implementation which is N x 3 x 336 x 336
     std::vector<std::vector<clip_image_f32>> preprocessed{imgs.size()};
@@ -296,7 +296,7 @@ EncodedImage llava_image_embed_make_with_bytes_slice(clip_ctx& ctx_clip, const o
     });
 
     const clip_image_f32& resized_preprocessed = preprocessed.at(0).at(0);
-    HeightWidth resized_source_size{resized_preprocessed.ny / patch_size, resized_preprocessed.nx / patch_size};
+    ImageSize resized_source_size{resized_preprocessed.ny / patch_size, resized_preprocessed.nx / patch_size};
     ov::Tensor input_tensor{ov::element::f32, {1, 3, size_t(resized_preprocessed.ny), size_t(resized_preprocessed.nx)}, (void*)(resized_preprocessed.buf.data())};
     ov::Tensor pixel_values = preprocess_for_encoder(input_tensor, patch_size);
     encoder.set_tensor("pixel_values", pixel_values);
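
Note that `resized_source_size` is measured in patches, not pixels: each pixel dimension is divided by `patch_size`. A worked example with illustrative numbers (448 and 14 are assumptions, not values from this diff):

    // A 448x448 preprocessed image with patch_size = 14 gives a 32x32 patch grid.
    ImageSize resized_source_size{448 / 14, 448 / 14};  // {32, 32} -> 1024 position ids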
@@ -314,35 +314,53 @@ EncodedImage llava_image_embed_make_with_bytes_slice(clip_ctx& ctx_clip, const o
         return {std::move(resized_source), resized_source_size};
     }
 
-    HeightWidth size{
+    ImageSize raw_size{
         size_t(preprocessed.at(1).at(0).ny),
         size_t(preprocessed.at(1).at(0).nx)
     };
-    std::vector<HeightWidth> sliced_sizes;
-    size_t n_patches = size.height / patch_size * size.width / patch_size,
+    ImageSize slices_size{
+        raw_size.height / patch_size,
+        raw_size.width / patch_size
+    };
+    size_t n_patches = slices_size.height * slices_size.width,
         old_hidden_size = resized_source.get_shape().at(2);
     ov::Tensor encoded_slices{ov::element::f32, {preprocessed.size() - 1, preprocessed.at(1).size(), n_patches, old_hidden_size}};
     for (size_t row = 1; row < preprocessed.size(); ++row) {
         for (size_t col = 0; col < preprocessed.at(row).size(); ++col) {
             clip_image_f32& elem = preprocessed.at(row).at(col);
-            sliced_sizes.push_back({elem.ny / patch_size, elem.nx / patch_size});
             ov::Tensor pixel_values = preprocess_for_encoder(
                 {ov::element::f32, {1, 3, size_t(elem.ny), size_t(elem.nx)}, elem.buf.data()},
                 patch_size
             );
             encoder.set_tensor("pixel_values", pixel_values);
-            ov::Tensor patch_attention_mask{ov::element::boolean, {1, 1, sliced_sizes.back().height * sliced_sizes.back().width}};
+            ov::Tensor patch_attention_mask{ov::element::boolean, {1, 1, slices_size.height * slices_size.width}};
             std::fill_n(patch_attention_mask.data<bool>(), patch_attention_mask.get_size(), true);
             encoder.set_tensor("patch_attention_mask", patch_attention_mask);
-            ov::Tensor position_ids = prepare_vis_position_ids(pixel_values, patch_attention_mask, {sliced_sizes.back()}, ctx_clip.patch_size, ctx_clip.image_size / ctx_clip.patch_size);
+            ov::Tensor position_ids = prepare_vis_position_ids(pixel_values, patch_attention_mask, {slices_size}, ctx_clip.patch_size, ctx_clip.image_size / ctx_clip.patch_size);
             encoder.set_tensor("position_ids", position_ids);
             const ov::Tensor& old = encoder.get_output_tensor();
             encoder.set_output_tensor({ov::element::f32, {1, n_patches, old_hidden_size}, encoded_slices.data<float>() + ((row - 1) * preprocessed.at(row).size() + col) * n_patches * old_hidden_size});
             encoder.infer();
             encoder.set_output_tensor(old);
         }
     }
-    return {resized_source, resized_source_size, encoded_slices, sliced_sizes};
+    return {resized_source, resized_source_size, encoded_slices, slices_size};
+}
+
+ProcessorConfig from_any_map(
+    const ov::AnyMap& config_map,
+    const ProcessorConfig& initial
+) {
+    auto iter = config_map.find("processor_config");
+    ProcessorConfig extracted_config = config_map.end() != iter ?
+        iter->second.as<ProcessorConfig>() : initial;
+    using utils::read_anymap_param;
+    read_anymap_param(config_map, "patch_size", extracted_config.patch_size);
+    read_anymap_param(config_map, "scale_resolution", extracted_config.scale_resolution);
+    read_anymap_param(config_map, "max_slice_nums", extracted_config.max_slice_nums);
+    read_anymap_param(config_map, "norm_mean", extracted_config.norm_mean);
+    read_anymap_param(config_map, "norm_std", extracted_config.norm_std);
+    return extracted_config;
+}
 }
 
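Two remarks on the hunk above. First, the per-slice `sliced_sizes` vector disappears because every slice produced by `::slice_image` is assumed to share identical dimensions, so the single `slices_size` computed from `preprocessed.at(1).at(0)` covers every iteration of the loop. Second, `from_any_map` layers its sources: a whole `"processor_config"` entry, when present, replaces `initial` as the starting point, and any individually supplied fields then override that. A hedged sketch of the resulting precedence (the values are invented for illustration):

    // Hypothetical: start from a full config object, then patch one field.
    ProcessorConfig base;            // defaults
    base.max_slice_nums = 9;
    ov::AnyMap config_map{
        {"processor_config", base},  // replaces `initial` wholesale
        {"max_slice_nums", 4}        // then overrides a single field
    };
    ProcessorConfig cfg = from_any_map(config_map, ProcessorConfig{});
    // cfg.max_slice_nums == 4; the remaining fields come from `base`.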
@@ -366,7 +384,7 @@ EncodedImage VisionEncoder::encode(const ov::Tensor& image, const ProcessorConfi
 }
 
 EncodedImage VisionEncoder::encode(const ov::Tensor& image, const ov::AnyMap& config_map) {
-    return encode(image, utils::from_any_map(
+    return encode(image, from_any_map(
         config_map, m_processor_config
     ));
 }
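
With the `utils::` qualifier dropped, the call now resolves to the file-local `from_any_map` added above. At the call site this allows per-call tweaks of the stored `m_processor_config`; a usage sketch (the variable names, helpers, and values are hypothetical, not from this PR):

    // Hypothetical call site: tweak two processor fields for this encode only.
    VisionEncoder encoder = get_vision_encoder();     // placeholder accessor
    ov::Tensor image = load_image_tensor("cat.png");  // placeholder loader
    EncodedImage encoded = encoder.encode(image, ov::AnyMap{
        {"max_slice_nums", 4},
        {"scale_resolution", 448}
    });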