@@ -228,7 +228,7 @@ std::vector<int64_t> bucket_size_right(const std::vector<float>& fractional_coor
 ov::Tensor prepare_vis_position_ids(
     const ov::Tensor& pixel_values,
     const ov::Tensor& patch_attention_mask,
-    const std::vector<HeightWidth> tgt_sizes,
+    const std::vector<ImageSize> tgt_sizes,
     size_t patch_size,
     size_t num_patches_per_side
 ) {
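
For orientation, `ImageSize`, which this diff substitutes for `HeightWidth`, is presumably a plain height/width pair. A minimal sketch of what it could look like, assuming the rename keeps the old two-field layout:

    struct ImageSize {
        size_t height;  // rows, in pixels or patches depending on context
        size_t width;   // columns, same unit as height
    };

Callers such as `prepare_vis_position_ids` only read `height` and `width`, so any aggregate with those two fields would fit.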
@@ -283,7 +283,7 @@ EncodedImage llava_image_embed_make_with_bytes_slice(clip_ctx& ctx_clip, const o
     };
     std::vector<std::vector<clip_image_u8>> imgs = ::slice_image(source, max_slice_nums, scale_resolution, patch_size, never_split);
     std::vector<std::vector<ov::Tensor>> results;
-    std::vector<std::vector<HeightWidth>> sizes;
+    std::vector<std::vector<ImageSize>> sizes;
 
     // std::vector<clip_image_f32*> img_res_v; // format N x H x W x RGB (N x 336 x 336 x 3), so interleaved RGB - different to the python implementation which is N x 3 x 336 x 336
     std::vector<std::vector<clip_image_f32>> preprocessed{imgs.size()};
@@ -296,7 +296,7 @@ EncodedImage llava_image_embed_make_with_bytes_slice(clip_ctx& ctx_clip, const o
     });
 
     const clip_image_f32& resized_preprocessed = preprocessed.at(0).at(0);
-    HeightWidth resized_source_size{resized_preprocessed.ny / patch_size, resized_preprocessed.nx / patch_size};
+    ImageSize resized_source_size{resized_preprocessed.ny / patch_size, resized_preprocessed.nx / patch_size};
     ov::Tensor input_tensor{ov::element::f32, {1, 3, size_t(resized_preprocessed.ny), size_t(resized_preprocessed.nx)}, (void*)(resized_preprocessed.buf.data())};
     ov::Tensor pixel_values = preprocess_for_encoder(input_tensor, patch_size);
     encoder.set_tensor("pixel_values", pixel_values);
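
Note that `resized_source_size` is measured in patches, not pixels: each pixel dimension is divided by `patch_size`. A worked example with illustrative numbers (448 and 14 are assumptions, not values from this diff):

    // A 448x448 preprocessed image with patch_size = 14 gives a 32x32 patch grid.
    ImageSize resized_source_size{448 / 14, 448 / 14};  // {32, 32} -> 1024 position ids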
@@ -314,35 +314,53 @@ EncodedImage llava_image_embed_make_with_bytes_slice(clip_ctx& ctx_clip, const o
         return {std::move(resized_source), resized_source_size};
     }
 
-    HeightWidth size{
+    ImageSize raw_size{
         size_t(preprocessed.at(1).at(0).ny),
         size_t(preprocessed.at(1).at(0).nx)
     };
-    std::vector<HeightWidth> sliced_sizes;
-    size_t n_patches = size.height / patch_size * size.width / patch_size,
+    ImageSize slices_size{
+        raw_size.height / patch_size,
+        raw_size.width / patch_size
+    };
+    size_t n_patches = slices_size.height * slices_size.width,
         old_hidden_size = resized_source.get_shape().at(2);
     ov::Tensor encoded_slices{ov::element::f32, {preprocessed.size() - 1, preprocessed.at(1).size(), n_patches, old_hidden_size}};
     for (size_t row = 1; row < preprocessed.size(); ++row) {
         for (size_t col = 0; col < preprocessed.at(row).size(); ++col) {
             clip_image_f32& elem = preprocessed.at(row).at(col);
-            sliced_sizes.push_back({elem.ny / patch_size, elem.nx / patch_size});
             ov::Tensor pixel_values = preprocess_for_encoder(
                 {ov::element::f32, {1, 3, size_t(elem.ny), size_t(elem.nx)}, elem.buf.data()},
                 patch_size
             );
             encoder.set_tensor("pixel_values", pixel_values);
-            ov::Tensor patch_attention_mask{ov::element::boolean, {1, 1, sliced_sizes.back().height * sliced_sizes.back().width}};
+            ov::Tensor patch_attention_mask{ov::element::boolean, {1, 1, slices_size.height * slices_size.width}};
             std::fill_n(patch_attention_mask.data<bool>(), patch_attention_mask.get_size(), true);
             encoder.set_tensor("patch_attention_mask", patch_attention_mask);
-            ov::Tensor position_ids = prepare_vis_position_ids(pixel_values, patch_attention_mask, {sliced_sizes.back()}, ctx_clip.patch_size, ctx_clip.image_size / ctx_clip.patch_size);
+            ov::Tensor position_ids = prepare_vis_position_ids(pixel_values, patch_attention_mask, {slices_size}, ctx_clip.patch_size, ctx_clip.image_size / ctx_clip.patch_size);
             encoder.set_tensor("position_ids", position_ids);
             const ov::Tensor& old = encoder.get_output_tensor();
             encoder.set_output_tensor({ov::element::f32, {1, n_patches, old_hidden_size}, encoded_slices.data<float>() + ((row - 1) * preprocessed.at(row).size() + col) * n_patches * old_hidden_size});
             encoder.infer();
             encoder.set_output_tensor(old);
         }
     }
-    return {resized_source, resized_source_size, encoded_slices, sliced_sizes};
+    return {resized_source, resized_source_size, encoded_slices, slices_size};
+}
+
+ProcessorConfig from_any_map(
+    const ov::AnyMap& config_map,
+    const ProcessorConfig& initial
+) {
+    auto iter = config_map.find("processor_config");
+    ProcessorConfig extracted_config = config_map.end() != iter ?
+        iter->second.as<ProcessorConfig>() : initial;
+    using utils::read_anymap_param;
+    read_anymap_param(config_map, "patch_size", extracted_config.patch_size);
+    read_anymap_param(config_map, "scale_resolution", extracted_config.scale_resolution);
+    read_anymap_param(config_map, "max_slice_nums", extracted_config.max_slice_nums);
+    read_anymap_param(config_map, "norm_mean", extracted_config.norm_mean);
+    read_anymap_param(config_map, "norm_std", extracted_config.norm_std);
+    return extracted_config;
+}
 }
 
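Two remarks on the hunk above. First, the per-slice `sliced_sizes` vector disappears because every slice produced by `::slice_image` is assumed to share identical dimensions, so the single `slices_size` computed from `preprocessed.at(1).at(0)` covers every iteration of the loop. Second, `from_any_map` layers its sources: a whole `"processor_config"` entry, when present, replaces `initial` as the starting point, and any individually supplied fields then override that. A hedged sketch of the resulting precedence (the values are invented for illustration):

    // Hypothetical: start from a full config object, then patch one field.
    ProcessorConfig base;            // defaults
    base.max_slice_nums = 9;
    ov::AnyMap config_map{
        {"processor_config", base},  // replaces `initial` wholesale
        {"max_slice_nums", 4}        // then overrides a single field
    };
    ProcessorConfig cfg = from_any_map(config_map, ProcessorConfig{});
    // cfg.max_slice_nums == 4; the remaining fields come from `base`.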
@@ -366,7 +384,7 @@ EncodedImage VisionEncoder::encode(const ov::Tensor& image, const ProcessorConfi
 }
 
 EncodedImage VisionEncoder::encode(const ov::Tensor& image, const ov::AnyMap& config_map) {
-    return encode(image, utils::from_any_map(
+    return encode(image, from_any_map(
         config_map, m_processor_config
     ));
 }
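
With the `utils::` qualifier dropped, the call now resolves to the file-local `from_any_map` added above. At the call site this allows per-call tweaks of the stored `m_processor_config`; a usage sketch (the variable names, helpers, and values are hypothetical, not from this PR):

    // Hypothetical call site: tweak two processor fields for this encode only.
    VisionEncoder encoder = get_vision_encoder();     // placeholder accessor
    ov::Tensor image = load_image_tensor("cat.png");  // placeholder loader
    EncodedImage encoded = encoder.encode(image, ov::AnyMap{
        {"max_slice_nums", 4},
        {"scale_resolution", 448}
    });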