Optimize CPU image preprocessing of Phi3 vision (#2999)

jade-cho · web-flow · commit ffa946a50b6e · 2025-11-14T08:01:16.000Z
## Description - Optimize the CPU implementation for image preprocessing for Phi-3 Vision model. - The optimized functions are bicubic_resize() and channels_first(). CVS-176394
diff --git a/src/cpp/src/visual_language/phi3_vision/classes.cpp b/src/cpp/src/visual_language/phi3_vision/classes.cpp
@@ -73,6 +73,93 @@ ov::Tensor HD_transform(const ov::Tensor& uint8, size_t num_crops) {
     return padding_336(ov::Tensor{ov::element::u8, {1, new_h, new_w, 3}, dst.buf.data()});
 }
 
+void bicubic_resize_phi3(const clip_image_u8& img, clip_image_u8& dst, int target_width, int target_height) {
+    const int nx = img.nx;
+    const int ny = img.ny;
+
+    dst.nx = target_width;
+    dst.ny = target_height;
+    dst.buf.resize(3 * target_width * target_height);
+
+    const float tx = static_cast<float>(nx) / static_cast<float>(target_width);
+    const float ty = static_cast<float>(ny) / static_cast<float>(target_height);
+
+    constexpr float _1_3 = 1.0f / 3.0f;
+    constexpr float _1_6 = 1.0f / 6.0f;
+
+    float pixels[4];
+
+    auto clip_coord = [](int x, int lower, int upper) -> int {
+        return std::max(lower, std::min(x, upper));
+    };
+
+    for (int i = 0; i < target_height; i++) {
+        const float fy = ty * i;
+        const int y = static_cast<int>(fy);
+        const float dy = fy - y;
+
+        const int y_coords[4] = {
+            clip_coord(y - 1, 0, ny - 1),
+            clip_coord(y, 0, ny - 1),
+            clip_coord(y + 1, 0, ny - 1),
+            clip_coord(y + 2, 0, ny - 1)
+        };
+
+        for (int j = 0; j < target_width; j++) {
+            const float fx = tx * j;
+            const int x = static_cast<int>(fx);
+            const float dx = fx - x;
+
+            const int x_coords[4] = {
+                clip_coord(x - 1, 0, nx - 1),
+                clip_coord(x, 0, nx - 1),
+                clip_coord(x + 1, 0, nx - 1),
+                clip_coord(x + 2, 0, nx - 1)
+            };
+
+            const int dst_base_idx = (i * target_width + j) * 3;
+
+            for (int k = 0; k < 3; k++) {
+                for (int jj = 0; jj < 4; jj++) {
+                    const int row_base = y_coords[jj] * nx;
+                    const uint8_t* row_ptr = &img.buf[row_base * 3 + k];
+
+                    const float p[4] = {
+                        static_cast<float>(row_ptr[x_coords[0] * 3]),
+                        static_cast<float>(row_ptr[x_coords[1] * 3]),
+                        static_cast<float>(row_ptr[x_coords[2] * 3]),
+                        static_cast<float>(row_ptr[x_coords[3] * 3])
+                    };
+
+                    const float a0 = p[1];
+                    const float d0 = p[0] - a0;
+                    const float d2 = p[2] - a0;
+                    const float d3 = p[3] - a0;
+                    const float a1 = -_1_3 * d0 + d2 - _1_6 * d3;
+                    const float a2 = 0.5f * (d0 + d2);
+                    const float a3 = -_1_6 * d0 - 0.5f * d2 + _1_6 * d3;
+
+                    pixels[jj] = a0 + dx * (a1 + dx * (a2 + dx * a3));
+                }
+
+                const float a0 = pixels[1];
+                const float d0 = pixels[0] - a0;
+                const float d2 = pixels[2] - a0;
+                const float d3 = pixels[3] - a0;
+                const float a1 = -_1_3 * d0 + d2 - _1_6 * d3;
+                const float a2 = 0.5f * (d0 + d2);
+                const float a3 = -_1_6 * d0 - 0.5f * d2 + _1_6 * d3;
+
+                const float result = a0 + dy * (a1 + dy * (a2 + dy * a3));
+
+                dst.buf[dst_base_idx + k] = static_cast<uint8_t>(
+                    std::min(std::max(std::round(result), 0.0f), 255.0f)
+                );
+            }
+        }
+    }
+}
+
 ov::Tensor mean_scale(const ov::Tensor& uint8, const ProcessorConfig& config) {
     auto uint_8_data = uint8.data<uint8_t>();
     ov::Tensor float_normalized{ov::element::f32, uint8.get_shape()};
@@ -87,15 +174,30 @@ ov::Tensor mean_scale(const ov::Tensor& uint8, const ProcessorConfig& config) {
 }
 
 ov::Tensor channels_first(const ov::Tensor& _1hw3) {
-    ov::Shape shape = _1hw3.get_shape();
-    ov::Tensor _13hw = ov::Tensor{ov::element::f32, {1, 3, shape.at(1), shape.at(2)}};
-    auto _1hw3_data = _1hw3.data<float>();
+    const ov::Shape shape = _1hw3.get_shape();
+    const size_t height = shape.at(1);
+    const size_t width = shape.at(2);
+    const size_t hw = height * width;
+
+    ov::Tensor _13hw = ov::Tensor{ov::element::f32, {1, 3, height, width}};
+    const float* _1hw3_data = _1hw3.data<float>();
     float* _13hw_data = _13hw.data<float>();
-    for (size_t plane = 0; plane < 3; ++plane) {
-        for (size_t row = 0; row < shape.at(1); ++row) {
-            for (size_t col = 0; col < shape.at(2); ++col) {
-                _13hw_data[plane * shape.at(1) * shape.at(2) + row * shape.at(2) + col] = _1hw3_data[row * shape.at(2) * 3 + col * 3 + plane];
-            }
+
+    float* dst_channels[3] = {
+        _13hw_data,              // R channel
+        _13hw_data + hw,         // G channel
+        _13hw_data + 2 * hw      // B channel
+    };
+
+    for (size_t row = 0; row < height; ++row) {
+        const size_t row_offset = row * width;
+        const float* src_row = _1hw3_data + row_offset * 3;
+        for (size_t col = 0; col < width; ++col) {
+            const size_t dst_offset = row_offset + col;
+            const size_t src_offset = col * 3;
+            dst_channels[0][dst_offset] = src_row[src_offset];     // R
+            dst_channels[1][dst_offset] = src_row[src_offset + 1]; // G
+            dst_channels[2][dst_offset] = src_row[src_offset + 2]; // B
         }
     }
     return _13hw;
@@ -201,7 +303,7 @@ std::tuple<ov::Tensor, ImageSize> get_pixel_values_phi3_v(const ov::Tensor& imag
     ImageSize image_size{hd_image.get_shape().at(2), hd_image.get_shape().at(1)};
     clip_image_u8 img{int(hd_image.get_shape().at(2)), int(hd_image.get_shape().at(1)), {hd_image.data<uint8_t>(), hd_image.data<uint8_t>() + hd_image.get_size()}};
     clip_image_u8 dst;
-    bicubic_resize(img, dst, INPUT_IMAGE_SIZE, INPUT_IMAGE_SIZE);
+    bicubic_resize_phi3(img, dst, INPUT_IMAGE_SIZE, INPUT_IMAGE_SIZE);
     ov::Tensor global_image{ov::element::u8, {1, INPUT_IMAGE_SIZE, INPUT_IMAGE_SIZE, 3}, dst.buf.data()};
     global_image = mean_scale(global_image, config);
     hd_image = mean_scale(hd_image, config);