Skip to content

Commit d0f89da

Browse files
Wovchena, wenyi5608, yangsu2022, yatarkan, and akladiev
authored
Clean up VLMPipeline (#923)
Address comments in #912 --------- Co-authored-by: wenyi5608 <93560477+wenyi5608@users.noreply.github.com> Co-authored-by: Yang,Su <su.yang@intel.com> Co-authored-by: Yaroslav Tarkan <yaroslav.tarkan@intel.com> Co-authored-by: Alina Kladieva <alina.kladieva@intel.com> Co-authored-by: Pavel Esir <pavel.esir@intel.com> Co-authored-by: Pavel Esir <pavel.esir@gmail.com> Co-authored-by: Artur Paniukov <chgk1101@gmail.com> Co-authored-by: Ekaterina Aidova <ekaterina.aidova@intel.com> Co-authored-by: Ilya Lavrenov <ilya.lavrenov@intel.com> Co-authored-by: Mikhail Ryzhov <mikhail.ryzhov@intel.com>
1 parent 5018f73 commit d0f89da

File tree

12 files changed

+111
-127
lines changed

12 files changed

+111
-127
lines changed

.github/workflows/causal_lm_cpp.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -708,12 +708,12 @@ jobs:
708708
python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
709709
python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
710710
python ./samples/cpp/visual_language_chat/export_MiniCPM-V-2_6.py ./miniCPM-V-2_6/
711-
wget https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11
711+
wget https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11 --output-document cat.jpg
712712
713713
- name: Run chat chat sample
714714
run: >
715715
source ./ov/setupvars.sh
716-
&& timeout 120s ./build/samples/cpp/visual_language_chat/visual_language_chat ./miniCPM-V-2_6/ d5fbbd1a-d484-415c-88cb-9986625b7b11
716+
&& timeout 120s ./build/samples/cpp/visual_language_chat/visual_language_chat ./miniCPM-V-2_6/ cat.jpg
717717
<<< $'What is on the image?\nWhat is special on the image?'
718718
719719
cpp-continuous-batching-ubuntu:

samples/cpp/visual_language_chat/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ export_MiniCPM-V-2_6.py miniCPM-V-2_6
1515

1616
## Run
1717

18-
https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11 can be used as a sample image.
18+
[This image](https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11) can be used as a sample image.
1919

2020
`visual_language_chat miniCPM-V-2_6 319483352-d5fbbd1a-d484-415c-88cb-9986625b7b11.jpg`
2121

samples/cpp/visual_language_chat/visual_language_chat.cpp

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,12 +26,10 @@ int main(int argc, char* argv[]) try {
2626

2727
pipe.start_chat();
2828
std::cout << "question:\n";
29-
if (!std::getline(std::cin, prompt)) {
30-
throw std::runtime_error("std::cin failed");
31-
}
29+
std::getline(std::cin, prompt);
3230
pipe.generate(
3331
prompt,
34-
ov::genai::image(std::move(image)),
32+
ov::genai::image(image),
3533
ov::genai::streamer(print_subword)
3634
);
3735
std::cout << "\n----------\n"

src/cpp/include/openvino/genai/vision_encoder.hpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88

99
namespace ov::genai {
1010
/// @brief A pair describing image size.
11-
struct HeightWidth {
11+
struct ImageSize {
1212
/// @brief Height of a corresponding image.
1313
size_t height;
1414
/// @brief Width of a corresponding image.
@@ -25,16 +25,16 @@ struct EncodedImage {
2525
ov::Tensor resized_source;
2626
/// @brief A size of an image used to compute embeddings for
2727
/// divided by ProcessorConfig's patch_size.
28-
HeightWidth resized_source_size;
28+
ImageSize resized_source_size;
2929
/// @brief Embeddings of images obtained from a source image by
3030
/// slicing at no more than max_slice_nums pieces and resizing.
3131
/// The tensor's shape is
3232
/// [slice_y, slice_x, number_of_embeddings, embedding_size].
3333
/// slices_sizes.size() == slice_y * slice_x.
3434
ov::Tensor slices;
35-
/// @brief Flattened sizes of images used to compute embeddings
35+
/// @brief A size of images used to compute embeddings
3636
/// stored in slices member divided by ProcessorConfig's patch_size.
37-
std::vector<HeightWidth> slices_sizes;
37+
ImageSize slices_size;
3838
};
3939

4040
/// @brief A class used to infer embeddings of an image using

src/cpp/include/openvino/genai/vlm_pipeline.hpp

Lines changed: 4 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -65,37 +65,14 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline {
6565
explicit VLMPipeline(
6666
const std::filesystem::path& model_dir,
6767
const std::string& device="CPU",
68-
const ov::AnyMap device_config={},
69-
ov::Core core=ov::Core{}
70-
) : VLMPipeline{
71-
model_dir,
72-
Tokenizer(model_dir.string(), device_config),
73-
device,
74-
device_config,
75-
core
76-
} {}
77-
78-
/// @brief Construct a pipeline form a folder containing model IRs
79-
/// and from a Tokenizer instance.
80-
/// @param model_dir A folder to read model IRs.
81-
/// @param tokenizer An instance of Tokenizer to use.
82-
/// @param device Inference device.
83-
/// @param device_config A config to pass to ov::Core.set_property()
84-
/// and ov::Core::compile_model().
85-
/// @param core ov::Core instance to use.
86-
VLMPipeline(
87-
const std::filesystem::path& model_dir,
88-
const ov::genai::Tokenizer& tokenizer,
89-
const std::string& device="CPU",
90-
const ov::AnyMap device_config={},
91-
ov::Core core=ov::Core{}
68+
const ov::AnyMap device_config={}
9269
);
9370

9471
/// @brief Default destructor.
9572
~VLMPipeline();
9673

9774
/// @brief Generate a response given a prompt and any number of
98-
/// uint8 RGB images.
75+
/// uint8 RGB images with [NCHW] or [CHW] layout.
9976
/// @param prompt A prompt to respond to.
10077
/// @param images Images to be prepended to a prompt.
10178
/// @param generation_config A config to follow for text generation.
@@ -120,7 +97,7 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline {
12097
/// @brief Generate a response given a prompt and arbitrary number
12198
/// of ov::Property instances.
12299
/// Example:
123-
/// generate("text", image(std::move(rgb)), do_sample(true));
100+
/// generate("text", image(rgb), do_sample(true));
124101
/// @param prompt A prompt to respond to.
125102
/// @param ...properties ov::Property instances to be combined into
126103
/// ov::AnyMap.
@@ -166,7 +143,7 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline {
166143

167144
/*
168145
* utils that allow to use generate() in the following way:
169-
* pipe.generate(prompt, ov::genai::image(std::move(image_tensor))).
146+
* pipe.generate(prompt, ov::genai::image(image_tensor)).
170147
*/
171148
static constexpr ov::Property<ov::Tensor> image{"image"};
172149
static constexpr ov::Property<std::vector<ov::Tensor>> images{"images"};

src/cpp/src/clip.cpp

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,6 @@
66
// I'll gradually clean and extend it
77
// Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
88

9-
#define STB_IMAGE_IMPLEMENTATION
10-
#include "stb_image.hpp"
11-
129
#include <cassert>
1310
#include <cmath>
1411
#include <cstdlib>

src/cpp/src/clip.hpp

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
11
// Copyright (C) 2023-2024 Intel Corporation
22
// SPDX-License-Identifier: Apache-2.0
33

4-
#ifndef CLIP_H
5-
#define CLIP_H
4+
#pragma once
65

76
#include <vector>
87
#include <numeric>
@@ -53,4 +52,3 @@ bool bicubic_resize(const clip_image_u8& img, clip_image_u8& dst, int target_wid
5352

5453
/** preprocess img and store the result in res_imgs, pad_to_square may be overriden to false depending on model configuration */
5554
clip_image_f32 clip_image_preprocess(struct clip_ctx& ctx, const clip_image_u8& img);
56-
#endif // CLIP_H

src/cpp/src/utils.cpp

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -211,6 +211,20 @@ std::pair<ov::AnyMap, ov::AnyMap> split_core_complile_config(const ov::AnyMap& p
211211
return {core_config, compile_config};
212212
};
213213

214+
ov::genai::TokenizedInputs subtract_chat_tokenized_inputs(const ov::genai::TokenizedInputs& minuend, const ov::genai::TokenizedInputs& subtrahend) {
215+
auto minuend_size = minuend.input_ids.get_size();
216+
auto subtrahend_size = subtrahend.input_ids.get_size();
217+
ov::Shape new_shape{1, minuend_size - subtrahend_size};
218+
219+
ov::Tensor new_input_ids(ov::element::i64, new_shape);
220+
auto data_ptr = minuend.input_ids.data<int64_t>();
221+
std::copy(data_ptr + subtrahend_size, data_ptr + minuend_size, new_input_ids.data<int64_t>());
222+
223+
ov::Tensor new_attention_mask(ov::element::i64, new_shape);
224+
std::fill_n(new_attention_mask.data<int64_t>(), new_shape[1], 1);
225+
226+
return {new_input_ids, new_attention_mask};
227+
}
214228
} // namespace utils
215229
} // namespace genai
216230
} // namespace ov

src/cpp/src/utils.hpp

Lines changed: 1 addition & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -86,20 +86,7 @@ ProcessorConfig from_any_map(
8686

8787
std::pair<ov::AnyMap, ov::AnyMap> split_core_complile_config(const ov::AnyMap& plugin_config);
8888

89-
inline ov::genai::TokenizedInputs subtract_chat_tokenized_inputs(const ov::genai::TokenizedInputs& fisrt, const ov::genai::TokenizedInputs& second){
90-
auto first_size = fisrt.input_ids.get_size();
91-
auto second_size = second.input_ids.get_size();
92-
ov::Shape new_shape{1, first_size - second_size};
93-
94-
ov::Tensor new_input_ids(ov::element::i64, new_shape);
95-
auto data_ptr = fisrt.input_ids.data<int64_t>();
96-
std::copy(data_ptr + second_size, data_ptr + first_size, new_input_ids.data<int64_t>());
97-
98-
ov::Tensor new_attention_mask(ov::element::i64, new_shape);
99-
std::fill_n(new_attention_mask.data<int64_t>(), new_shape[1], 1);
100-
101-
return {new_input_ids, new_attention_mask};
102-
}
89+
ov::genai::TokenizedInputs subtract_chat_tokenized_inputs(const ov::genai::TokenizedInputs& minuend, const ov::genai::TokenizedInputs& subtrahend);
10390
} // namespace utils
10491
} // namespace genai
10592
} // namespace ov

src/cpp/src/vision_encoder.cpp

Lines changed: 29 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -228,7 +228,7 @@ std::vector<int64_t> bucket_size_right(const std::vector<float>& fractional_coor
228228
ov::Tensor prepare_vis_position_ids(
229229
const ov::Tensor& pixel_values,
230230
const ov::Tensor& patch_attention_mask,
231-
const std::vector<HeightWidth> tgt_sizes,
231+
const std::vector<ImageSize> tgt_sizes,
232232
size_t patch_size,
233233
size_t num_patches_per_side
234234
) {
@@ -283,7 +283,7 @@ EncodedImage llava_image_embed_make_with_bytes_slice(clip_ctx& ctx_clip, const o
283283
};
284284
std::vector<std::vector<clip_image_u8>> imgs = ::slice_image(source, max_slice_nums, scale_resolution, patch_size, never_split);
285285
std::vector<std::vector<ov::Tensor>> results;
286-
std::vector<std::vector<HeightWidth>> sizes;
286+
std::vector<std::vector<ImageSize>> sizes;
287287

288288
// std::vector<clip_image_f32*> img_res_v; // format N x H x W x RGB (N x 336 x 336 x 3), so interleaved RGB - different to the python implementation which is N x 3 x 336 x 336
289289
std::vector<std::vector<clip_image_f32>> preprocessed{imgs.size()};
@@ -296,7 +296,7 @@ EncodedImage llava_image_embed_make_with_bytes_slice(clip_ctx& ctx_clip, const o
296296
});
297297

298298
const clip_image_f32& resized_preprocessed = preprocessed.at(0).at(0);
299-
HeightWidth resized_source_size{resized_preprocessed.ny / patch_size, resized_preprocessed.nx / patch_size};
299+
ImageSize resized_source_size{resized_preprocessed.ny / patch_size, resized_preprocessed.nx / patch_size};
300300
ov::Tensor input_tensor{ov::element::f32, {1, 3, size_t(resized_preprocessed.ny), size_t(resized_preprocessed.nx)}, (void*)(resized_preprocessed.buf.data())};
301301
ov::Tensor pixel_values = preprocess_for_encoder(input_tensor, patch_size);
302302
encoder.set_tensor("pixel_values", pixel_values);
@@ -314,35 +314,53 @@ EncodedImage llava_image_embed_make_with_bytes_slice(clip_ctx& ctx_clip, const o
314314
return {std::move(resized_source), resized_source_size};
315315
}
316316

317-
HeightWidth size{
317+
ImageSize raw_size{
318318
size_t(preprocessed.at(1).at(0).ny),
319319
size_t(preprocessed.at(1).at(0).nx)
320320
};
321-
std::vector<HeightWidth> sliced_sizes;
322-
size_t n_patches = size.height / patch_size * size.width / patch_size,
321+
ImageSize slices_size{
322+
raw_size.height / patch_size,
323+
raw_size.width / patch_size
324+
};
325+
size_t n_patches = slices_size.height * slices_size.width,
323326
old_hidden_size = resized_source.get_shape().at(2);
324327
ov::Tensor encoded_slices{ov::element::f32, {preprocessed.size() - 1, preprocessed.at(1).size(), n_patches, old_hidden_size}};
325328
for (size_t row = 1; row < preprocessed.size(); ++row) {
326329
for (size_t col = 0; col < preprocessed.at(row).size(); ++col) {
327330
clip_image_f32& elem = preprocessed.at(row).at(col);
328-
sliced_sizes.push_back({elem.ny / patch_size, elem.nx / patch_size});
329331
ov::Tensor pixel_values = preprocess_for_encoder(
330332
{ov::element::f32, {1, 3, size_t(elem.ny), size_t(elem.nx)}, elem.buf.data()},
331333
patch_size
332334
);
333335
encoder.set_tensor("pixel_values", pixel_values);
334-
ov::Tensor patch_attention_mask{ov::element::boolean, {1, 1, sliced_sizes.back().height * sliced_sizes.back().width}};
336+
ov::Tensor patch_attention_mask{ov::element::boolean, {1, 1, slices_size.height * slices_size.width}};
335337
std::fill_n(patch_attention_mask.data<bool>(), patch_attention_mask.get_size(), true);
336338
encoder.set_tensor("patch_attention_mask", patch_attention_mask);
337-
ov::Tensor position_ids = prepare_vis_position_ids(pixel_values, patch_attention_mask, {sliced_sizes.back()}, ctx_clip.patch_size, ctx_clip.image_size / ctx_clip.patch_size);
339+
ov::Tensor position_ids = prepare_vis_position_ids(pixel_values, patch_attention_mask, {slices_size}, ctx_clip.patch_size, ctx_clip.image_size / ctx_clip.patch_size);
338340
encoder.set_tensor("position_ids", position_ids);
339341
const ov::Tensor& old = encoder.get_output_tensor();
340342
encoder.set_output_tensor({ov::element::f32, {1, n_patches, old_hidden_size}, encoded_slices.data<float>() + ((row - 1) * preprocessed.at(row).size() + col) * n_patches * old_hidden_size});
341343
encoder.infer();
342344
encoder.set_output_tensor(old);
343345
}
344346
}
345-
return {resized_source, resized_source_size, encoded_slices, sliced_sizes};
347+
return {resized_source, resized_source_size, encoded_slices, slices_size};
348+
}
349+
350+
ProcessorConfig from_any_map(
351+
const ov::AnyMap& config_map,
352+
const ProcessorConfig& initial
353+
) {
354+
auto iter = config_map.find("processor_config");
355+
ProcessorConfig extracted_config = config_map.end() != iter ?
356+
iter->second.as<ProcessorConfig>() : initial;
357+
using utils::read_anymap_param;
358+
read_anymap_param(config_map, "patch_size", extracted_config.patch_size);
359+
read_anymap_param(config_map, "scale_resolution", extracted_config.scale_resolution);
360+
read_anymap_param(config_map, "max_slice_nums", extracted_config.max_slice_nums);
361+
read_anymap_param(config_map, "norm_mean", extracted_config.norm_mean);
362+
read_anymap_param(config_map, "norm_std", extracted_config.norm_std);
363+
return extracted_config;
346364
}
347365
}
348366

@@ -366,7 +384,7 @@ EncodedImage VisionEncoder::encode(const ov::Tensor& image, const ProcessorConfi
366384
}
367385

368386
EncodedImage VisionEncoder::encode(const ov::Tensor& image, const ov::AnyMap& config_map) {
369-
return encode(image, utils::from_any_map(
387+
return encode(image, from_any_map(
370388
config_map, m_processor_config
371389
));
372390
}

0 commit comments

Comments (0)