Skip to content

Commit ffa946a

Browse files
authored
Optimize CPU image preprocessing of Phi3 vision (#2999)
## Description
- Optimize the CPU implementation of image preprocessing for the Phi-3 Vision model.
- The optimized functions are bicubic_resize() (now provided as the Phi-3-specific bicubic_resize_phi3()) and channels_first().

CVS-176394
1 parent 71c9269 commit ffa946a

File tree

1 file changed

+111
-9
lines changed

1 file changed

+111
-9
lines changed

src/cpp/src/visual_language/phi3_vision/classes.cpp

Lines changed: 111 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,93 @@ ov::Tensor HD_transform(const ov::Tensor& uint8, size_t num_crops) {
7373
return padding_336(ov::Tensor{ov::element::u8, {1, new_h, new_w, 3}, dst.buf.data()});
7474
}
7575

76+
void bicubic_resize_phi3(const clip_image_u8& img, clip_image_u8& dst, int target_width, int target_height) {
77+
const int nx = img.nx;
78+
const int ny = img.ny;
79+
80+
dst.nx = target_width;
81+
dst.ny = target_height;
82+
dst.buf.resize(3 * target_width * target_height);
83+
84+
const float tx = static_cast<float>(nx) / static_cast<float>(target_width);
85+
const float ty = static_cast<float>(ny) / static_cast<float>(target_height);
86+
87+
constexpr float _1_3 = 1.0f / 3.0f;
88+
constexpr float _1_6 = 1.0f / 6.0f;
89+
90+
float pixels[4];
91+
92+
auto clip_coord = [](int x, int lower, int upper) -> int {
93+
return std::max(lower, std::min(x, upper));
94+
};
95+
96+
for (int i = 0; i < target_height; i++) {
97+
const float fy = ty * i;
98+
const int y = static_cast<int>(fy);
99+
const float dy = fy - y;
100+
101+
const int y_coords[4] = {
102+
clip_coord(y - 1, 0, ny - 1),
103+
clip_coord(y, 0, ny - 1),
104+
clip_coord(y + 1, 0, ny - 1),
105+
clip_coord(y + 2, 0, ny - 1)
106+
};
107+
108+
for (int j = 0; j < target_width; j++) {
109+
const float fx = tx * j;
110+
const int x = static_cast<int>(fx);
111+
const float dx = fx - x;
112+
113+
const int x_coords[4] = {
114+
clip_coord(x - 1, 0, nx - 1),
115+
clip_coord(x, 0, nx - 1),
116+
clip_coord(x + 1, 0, nx - 1),
117+
clip_coord(x + 2, 0, nx - 1)
118+
};
119+
120+
const int dst_base_idx = (i * target_width + j) * 3;
121+
122+
for (int k = 0; k < 3; k++) {
123+
for (int jj = 0; jj < 4; jj++) {
124+
const int row_base = y_coords[jj] * nx;
125+
const uint8_t* row_ptr = &img.buf[row_base * 3 + k];
126+
127+
const float p[4] = {
128+
static_cast<float>(row_ptr[x_coords[0] * 3]),
129+
static_cast<float>(row_ptr[x_coords[1] * 3]),
130+
static_cast<float>(row_ptr[x_coords[2] * 3]),
131+
static_cast<float>(row_ptr[x_coords[3] * 3])
132+
};
133+
134+
const float a0 = p[1];
135+
const float d0 = p[0] - a0;
136+
const float d2 = p[2] - a0;
137+
const float d3 = p[3] - a0;
138+
const float a1 = -_1_3 * d0 + d2 - _1_6 * d3;
139+
const float a2 = 0.5f * (d0 + d2);
140+
const float a3 = -_1_6 * d0 - 0.5f * d2 + _1_6 * d3;
141+
142+
pixels[jj] = a0 + dx * (a1 + dx * (a2 + dx * a3));
143+
}
144+
145+
const float a0 = pixels[1];
146+
const float d0 = pixels[0] - a0;
147+
const float d2 = pixels[2] - a0;
148+
const float d3 = pixels[3] - a0;
149+
const float a1 = -_1_3 * d0 + d2 - _1_6 * d3;
150+
const float a2 = 0.5f * (d0 + d2);
151+
const float a3 = -_1_6 * d0 - 0.5f * d2 + _1_6 * d3;
152+
153+
const float result = a0 + dy * (a1 + dy * (a2 + dy * a3));
154+
155+
dst.buf[dst_base_idx + k] = static_cast<uint8_t>(
156+
std::min(std::max(std::round(result), 0.0f), 255.0f)
157+
);
158+
}
159+
}
160+
}
161+
}
162+
76163
ov::Tensor mean_scale(const ov::Tensor& uint8, const ProcessorConfig& config) {
77164
auto uint_8_data = uint8.data<uint8_t>();
78165
ov::Tensor float_normalized{ov::element::f32, uint8.get_shape()};
@@ -87,15 +174,30 @@ ov::Tensor mean_scale(const ov::Tensor& uint8, const ProcessorConfig& config) {
87174
}
88175

89176
ov::Tensor channels_first(const ov::Tensor& _1hw3) {
90-
ov::Shape shape = _1hw3.get_shape();
91-
ov::Tensor _13hw = ov::Tensor{ov::element::f32, {1, 3, shape.at(1), shape.at(2)}};
92-
auto _1hw3_data = _1hw3.data<float>();
177+
const ov::Shape shape = _1hw3.get_shape();
178+
const size_t height = shape.at(1);
179+
const size_t width = shape.at(2);
180+
const size_t hw = height * width;
181+
182+
ov::Tensor _13hw = ov::Tensor{ov::element::f32, {1, 3, height, width}};
183+
const float* _1hw3_data = _1hw3.data<float>();
93184
float* _13hw_data = _13hw.data<float>();
94-
for (size_t plane = 0; plane < 3; ++plane) {
95-
for (size_t row = 0; row < shape.at(1); ++row) {
96-
for (size_t col = 0; col < shape.at(2); ++col) {
97-
_13hw_data[plane * shape.at(1) * shape.at(2) + row * shape.at(2) + col] = _1hw3_data[row * shape.at(2) * 3 + col * 3 + plane];
98-
}
185+
186+
float* dst_channels[3] = {
187+
_13hw_data, // R channel
188+
_13hw_data + hw, // G channel
189+
_13hw_data + 2 * hw // B channel
190+
};
191+
192+
for (size_t row = 0; row < height; ++row) {
193+
const size_t row_offset = row * width;
194+
const float* src_row = _1hw3_data + row_offset * 3;
195+
for (size_t col = 0; col < width; ++col) {
196+
const size_t dst_offset = row_offset + col;
197+
const size_t src_offset = col * 3;
198+
dst_channels[0][dst_offset] = src_row[src_offset]; // R
199+
dst_channels[1][dst_offset] = src_row[src_offset + 1]; // G
200+
dst_channels[2][dst_offset] = src_row[src_offset + 2]; // B
99201
}
100202
}
101203
return _13hw;
@@ -201,7 +303,7 @@ std::tuple<ov::Tensor, ImageSize> get_pixel_values_phi3_v(const ov::Tensor& imag
201303
ImageSize image_size{hd_image.get_shape().at(2), hd_image.get_shape().at(1)};
202304
clip_image_u8 img{int(hd_image.get_shape().at(2)), int(hd_image.get_shape().at(1)), {hd_image.data<uint8_t>(), hd_image.data<uint8_t>() + hd_image.get_size()}};
203305
clip_image_u8 dst;
204-
bicubic_resize(img, dst, INPUT_IMAGE_SIZE, INPUT_IMAGE_SIZE);
306+
bicubic_resize_phi3(img, dst, INPUT_IMAGE_SIZE, INPUT_IMAGE_SIZE);
205307
ov::Tensor global_image{ov::element::u8, {1, INPUT_IMAGE_SIZE, INPUT_IMAGE_SIZE, 3}, dst.buf.data()};
206308
global_image = mean_scale(global_image, config);
207309
hd_image = mean_scale(hd_image, config);

0 commit comments

Comments
 (0)