Commits (43)
ca857ff
Video to text python sample.
popovaan Nov 19, 2025
3b3c69d
Sample test.
popovaan Nov 20, 2025
b4a84f7
Update samples/python/visual_language_chat/video_to_text_chat.py
popovaan Nov 20, 2025
29d78c4
Added c++ sample.
popovaan Nov 24, 2025
1a8944a
Attempt to add opencv build to ga workflow.
popovaan Nov 24, 2025
1a0d25c
Revert "Attempt to add opencv build to ga workflow."
popovaan Nov 24, 2025
4d070ab
Used FetchContent to add opencv.
popovaan Nov 25, 2025
a8fa911
Corrected test.
popovaan Nov 25, 2025
bdc6940
Convert path to string().
popovaan Nov 25, 2025
735060e
Updated readme.
popovaan Nov 25, 2025
5dfdcf7
Set 8 frames.
popovaan Nov 25, 2025
43c76c9
Update samples/cpp/visual_language_chat/README.md
popovaan Nov 25, 2025
d77276f
Fixed opencv version, minor corrections.
popovaan Nov 25, 2025
cabd763
Added assert.
popovaan Nov 25, 2025
a1c1290
Merge branch 'master' into video_to_text_sample
popovaan Nov 25, 2025
46e7d5d
Increase samples build timeout.
popovaan Nov 26, 2025
9f4e6b9
Merge branch 'video_to_text_sample' of https://github.com/popovaan/op…
popovaan Nov 26, 2025
5b6044d
Cmake corrected.
popovaan Nov 26, 2025
54cebe6
Attempt to fix ci.
popovaan Nov 27, 2025
e8cb51e
Fix on win.
popovaan Nov 27, 2025
58b8be4
Merge branch 'master' into video_to_text_sample
popovaan Nov 27, 2025
6dac23e
Apply suggestions from code review
popovaan Nov 28, 2025
faffe59
Attempt to fix error.
popovaan Nov 28, 2025
6e749ff
Attempt to fix cmake.
popovaan Nov 28, 2025
f71bb59
Attempt to fix.
popovaan Dec 1, 2025
cc49e6c
Merge master.
popovaan Dec 1, 2025
bbf700c
Change video.
popovaan Dec 1, 2025
40ac708
Set WITH_FFMPEG.
popovaan Dec 1, 2025
ace9a6a
Temporarily remove launching of cpp sample.
popovaan Dec 1, 2025
f8a3d0d
Returned cpp sample launch.
popovaan Dec 1, 2025
87c58f6
Add install ffmpeg.
popovaan Dec 1, 2025
5440d42
Minor correction.
popovaan Dec 2, 2025
96d4fa2
Added libs install needed by ffmpeg.
popovaan Dec 2, 2025
21127ed
Minor correction.
popovaan Dec 2, 2025
95ce2a3
Add debug info.
popovaan Dec 2, 2025
8f4457e
Attempt to fix.
popovaan Dec 2, 2025
b7c8dd2
Applied comments, removed debug print.
popovaan Dec 2, 2025
e11adbb
Attempt to fix.
popovaan Dec 2, 2025
b274aea
Increase timeout.
popovaan Dec 2, 2025
d423f8e
Removed not needed code.
popovaan Dec 2, 2025
931a072
Increase timeout.
popovaan Dec 2, 2025
e9fd0ce
Update samples/cpp/visual_language_chat/video_to_text_chat.cpp
popovaan Dec 3, 2025
2883d5a
Increase timeout for building win samples.
popovaan Dec 3, 2025
2 changes: 1 addition & 1 deletion .github/workflows/linux.yml
@@ -379,7 +379,7 @@ jobs:
matrix:
build-type: [Release]
needs: [ openvino_download, genai_build_cmake ]
timeout-minutes: 10
timeout-minutes: 30
defaults:
run:
shell: bash
2 changes: 1 addition & 1 deletion .github/workflows/mac.yml
@@ -318,7 +318,7 @@ jobs:
matrix:
build-type: [Release]
needs: [ openvino_download, genai_build_cmake ]
timeout-minutes: 10
timeout-minutes: 30
defaults:
run:
shell: bash
2 changes: 1 addition & 1 deletion .github/workflows/windows.yml
@@ -487,7 +487,7 @@ jobs:
matrix:
build-type: [Release, Debug]
needs: [ openvino_download, genai_build_cpack ]
timeout-minutes: 10
timeout-minutes: 30
defaults:
run:
shell: pwsh
40 changes: 40 additions & 0 deletions samples/cpp/visual_language_chat/CMakeLists.txt
@@ -1,6 +1,10 @@
# Copyright (C) 2023-2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

if (MSVC)
    set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>")
endif()

find_package(OpenVINOGenAI REQUIRED
    PATHS
        "${CMAKE_BINARY_DIR}" # Reuse the package from the build.
@@ -55,3 +59,39 @@ install(TARGETS benchmark_vlm
        RUNTIME DESTINATION samples_bin/
        COMPONENT samples_bin
        EXCLUDE_FROM_ALL)


include(FetchContent)
FetchContent_Declare(
    opencv
    GIT_REPOSITORY https://github.com/opencv/opencv.git
    GIT_TAG 4.11.0
    GIT_SHALLOW TRUE
    GIT_PROGRESS TRUE
)
FetchContent_MakeAvailable(opencv)


add_executable(video_to_text_chat video_to_text_chat.cpp)

target_include_directories(video_to_text_chat PRIVATE
    ${OPENCV_CONFIG_FILE_INCLUDE_DIR}
    ${OPENCV_MODULE_opencv_core_LOCATION}/include
    ${OPENCV_MODULE_opencv_videoio_LOCATION}/include
)
target_link_libraries(video_to_text_chat opencv_core opencv_videoio openvino::genai cxxopts::cxxopts)

set_target_properties(video_to_text_chat PROPERTIES
    # Ensure out of box LC_RPATH on macOS with SIP
    INSTALL_RPATH_USE_LINK_PATH ON)

install(TARGETS video_to_text_chat
        RUNTIME DESTINATION samples_bin/
        COMPONENT samples_bin
        EXCLUDE_FROM_ALL)


install(DIRECTORY ${opencv_BINARY_DIR}/lib/
        DESTINATION samples_bin/
        COMPONENT samples_bin
        EXCLUDE_FROM_ALL)
18 changes: 15 additions & 3 deletions samples/cpp/visual_language_chat/README.md
@@ -3,8 +3,9 @@
This example showcases inference of Visual language models (VLMs). The application doesn't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. The sample features `ov::genai::VLMPipeline` and runs the simplest deterministic greedy sampling algorithm. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/minicpm-v-multimodal-chatbot) which provides an example of Visual-language assistant.


There are two sample files:
There are three sample files:
- [`visual_language_chat.cpp`](./visual_language_chat.cpp) demonstrates basic usage of the VLM pipeline.
- [`video_to_text_chat.cpp`](./video_to_text_chat.cpp) demonstrates video-to-text usage of the VLM pipeline.
- [`benchmark_vlm.cpp`](./benchmark_vlm.cpp) shows how to benchmark a VLM in OpenVINO GenAI. The script includes functionality for warm-up iterations, generating text and calculating various performance metrics.


@@ -19,9 +20,9 @@ pip install --upgrade-strategy eager -r ../../requirements.txt
optimum-cli export openvino --model openbmb/MiniCPM-V-2_6 --trust-remote-code MiniCPM-V-2_6
```

## Run
Follow [Get Started with Samples](https://docs.openvino.ai/2025/get-started/learn-openvino/openvino-samples/get-started-demos.html) to run samples.

Follow [Get Started with Samples](https://docs.openvino.ai/2025/get-started/learn-openvino/openvino-samples/get-started-demos.html) to run the sample.
## Run visual language chat:

[This image](https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11) can be used as a sample image.

@@ -31,6 +32,17 @@ Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is

Refer to the [Supported Models](https://openvinotoolkit.github.io/openvino.genai/docs/supported-models/#visual-language-models-vlms) for more details.


## Run video-to-text chat:

A model that supports video input is required to run this sample, for example `llava-hf/LLaVA-NeXT-Video-7B-hf`.
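
If you do not already have a converted model, it can be exported with `optimum-cli` the same way as MiniCPM-V-2_6 above (the exact flags may vary per model): `optimum-cli export openvino --model llava-hf/LLaVA-NeXT-Video-7B-hf --trust-remote-code LLaVA-NeXT-Video-7B-hf`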

[This video](https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/sample_demo_1.mp4) can be used as a sample video.

`video_to_text_chat ./LLaVA-NeXT-Video-7B-hf/ sample_demo_1.mp4`

Supported models with video input are listed in [this section](https://openvinotoolkit.github.io/openvino.genai/docs/use-cases/image-processing/#use-image-or-video-tags-in-prompt).
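
The device is an optional third argument and defaults to CPU, for example: `video_to_text_chat ./LLaVA-NeXT-Video-7B-hf/ sample_demo_1.mp4 GPU`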

## Run benchmark:

```sh
126 changes: 126 additions & 0 deletions samples/cpp/visual_language_chat/video_to_text_chat.cpp
@@ -0,0 +1,126 @@
// Copyright (C) 2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#include <openvino/genai/visual_language/pipeline.hpp>
#include <opencv2/core.hpp>
#include <opencv2/videoio.hpp>

#include <algorithm>
#include <cstring>
#include <filesystem>
#include <iostream>
#include <set>
#include <vector>

namespace fs = std::filesystem;

std::vector<size_t> make_indices(size_t total_frames, size_t num_frames) {
    std::vector<size_t> indices;
    indices.reserve(num_frames);

    // Pick num_frames indices evenly spaced across the video (assumes total_frames >= num_frames).
    auto step = float(total_frames) / num_frames;

    for (size_t i = 0; i < num_frames; ++i) {
        size_t idx = std::min(size_t(i * step), total_frames - 1);
        indices.push_back(idx);
    }

    return indices;
}

ov::Tensor load_video(const std::filesystem::path& video_path, size_t num_frames = 8) {
    cv::VideoCapture cap(video_path.string());

    if (!cap.isOpened()) {
        OPENVINO_THROW("Could not open the video file.");
    }
    size_t total_num_frames = cap.get(cv::CAP_PROP_FRAME_COUNT);
    auto indices = make_indices(total_num_frames, num_frames);

    cv::Mat frame;
    size_t width = cap.get(cv::CAP_PROP_FRAME_WIDTH);
    size_t height = cap.get(cv::CAP_PROP_FRAME_HEIGHT);
    // Sampled frames are packed as [num_frames, height, width, 3] in the BGR layout OpenCV decodes to.
    ov::Tensor video_tensor(ov::element::u8, ov::Shape{num_frames, height, width, 3});
    auto video_tensor_data = video_tensor.data<uint8_t>();

    size_t frame_idx = 0;
    while (cap.read(frame)) {
        OPENVINO_ASSERT(size_t(frame.cols) == width && size_t(frame.rows) == height && frame.channels() == 3);
        // Copy only the sampled frames into the output tensor.
        if (std::find(indices.begin(), indices.end(), frame_idx) != indices.end()) {
            memcpy(video_tensor_data, frame.data, frame.total() * 3 * sizeof(uint8_t));
            video_tensor_data += frame.total() * 3;
        }
        frame_idx++;
    }
    OPENVINO_ASSERT(frame_idx == total_num_frames);

    return video_tensor;
}

std::vector<ov::Tensor> load_videos(const std::filesystem::path& input_path) {
    if (input_path.empty() || !fs::exists(input_path)) {
        throw std::runtime_error{"Path to videos is empty or does not exist."};
    }
    if (fs::is_directory(input_path)) {
        // Load every file in the directory in sorted order.
        std::set<fs::path> sorted_videos{fs::directory_iterator(input_path), fs::directory_iterator()};
        std::vector<ov::Tensor> videos;
        for (const fs::path& dir_entry : sorted_videos) {
            videos.push_back(load_video(dir_entry));
        }
        return videos;
    }
    return {load_video(input_path)};
}

bool print_subword(std::string&& subword) {
    // Returning true stops generation, so stop only if writing to std::cout failed.
    return !(std::cout << subword << std::flush);
}

int main(int argc, char* argv[]) try {
    if (argc < 3 || argc > 4) {
        throw std::runtime_error(std::string{"Usage "} + argv[0] + " <MODEL_DIR> <VIDEO_FILE OR DIR_WITH_VIDEOS> [DEVICE]");
    }

    std::vector<ov::Tensor> videos = load_videos(argv[2]);

    // GPU and NPU can be used as well.
    // Note: if NPU is selected, only the language model will be run on NPU.
    std::string device = (argc == 4) ? argv[3] : "CPU";
    ov::AnyMap enable_compile_cache;
    if (device == "GPU") {
        // Cache compiled models on disk for GPU to save time on the
        // next run. It's not beneficial for CPU.
        enable_compile_cache.insert({ov::cache_dir("vlm_cache")});
    }
    ov::genai::VLMPipeline pipe(argv[1], device, enable_compile_cache);

    ov::genai::GenerationConfig generation_config;
    generation_config.max_new_tokens = 100;

    std::string prompt;

    pipe.start_chat();
    std::cout << "question:\n";

    // The first prompt is sent together with the video(s); follow-up turns reuse the chat history.
    std::getline(std::cin, prompt);
    pipe.generate(prompt,
                  ov::genai::videos(videos),
                  ov::genai::generation_config(generation_config),
                  ov::genai::streamer(print_subword));
    std::cout << "\n----------\n"
                 "question:\n";
    while (std::getline(std::cin, prompt)) {
        pipe.generate(prompt,
                      ov::genai::generation_config(generation_config),
                      ov::genai::streamer(print_subword));
        std::cout << "\n----------\n"
                     "question:\n";
    }
    pipe.finish_chat();
} catch (const std::exception& error) {
    try {
        std::cerr << error.what() << '\n';
    } catch (const std::ios_base::failure&) {}
    return EXIT_FAILURE;
} catch (...) {
    try {
        std::cerr << "Non-exception object thrown\n";
    } catch (const std::ios_base::failure&) {}
    return EXIT_FAILURE;
}
1 change: 1 addition & 0 deletions samples/deployment-requirements.txt
@@ -4,3 +4,4 @@ librosa==0.11.0 # For Whisper
pillow==12.0.0 # Image processing for VLMs
json5==0.12.1 # For ReAct
pydantic==2.12.5 # For Structured output json schema
opencv-python==4.12.0.88 # For video-to-text VLM sample
22 changes: 16 additions & 6 deletions samples/python/visual_language_chat/README.md
@@ -2,8 +2,9 @@

This example showcases inference of text-generation Vision Language Models (VLMs): `miniCPM-V-2_6` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. The sample features `openvino_genai.VLMPipeline` and configures it for the chat scenario. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/minicpm-v-multimodal-chatbot) which provides an example of Visual-language assistant.

There are two sample files:
There are three sample files:
- [`visual_language_chat.py`](./visual_language_chat.py) demonstrates basic usage of the VLM pipeline.
- [`video_to_text_chat.py`](./video_to_text_chat.py) demonstrates video-to-text usage of the VLM pipeline.
- [`benchmark_vlm.py`](./benchmark_vlm.py) shows how to benchmark a VLM in OpenVINO GenAI. The script includes functionality for warm-up iterations, generating text and calculating various performance metrics.

## Download and convert the model and tokenizers
@@ -38,20 +39,29 @@ tokenizer = AutoTokenizer.from_pretrained("openbmb/MiniCPM-V-2_6")
export_tokenizer(tokenizer, output_dir)
```

## Run:
Install [deployment-requirements.txt](../../deployment-requirements.txt) via `pip install -r ../../deployment-requirements.txt` to run VLM samples.

[This image](https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11) can be used as a sample image.
## Run image-to-text chat sample:

Install [deployment-requirements.txt](../../deployment-requirements.txt) via `pip install -r ../../deployment-requirements.txt` and then, run a sample:
[This image](https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11) can be used as a sample image.

`python visual_language_chat.py ./miniCPM-V-2_6/ 319483352-d5fbbd1a-d484-415c-88cb-9986625b7b11.jpg`

See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models.

## Run video-to-text chat sample:

A model that supports video input is required to run this sample, for example `llava-hf/LLaVA-NeXT-Video-7B-hf`.
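
If you do not already have a converted model, it can be exported with `optimum-cli` the same way as MiniCPM-V-2_6 above (the exact flags may vary per model): `optimum-cli export openvino --model llava-hf/LLaVA-NeXT-Video-7B-hf --trust-remote-code LLaVA-NeXT-Video-7B-hf`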

[This video](https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/sample_demo_1.mp4) can be used as a sample video.

`python video_to_text_chat.py ./LLaVA-NeXT-Video-7B-hf/ sample_demo_1.mp4`

Supported models with video input are listed in [this section](https://openvinotoolkit.github.io/openvino.genai/docs/use-cases/image-processing/#use-image-or-video-tags-in-prompt).
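
The core of the sample boils down to a few calls. Below is a minimal non-interactive sketch using the model directory and video above; the prompt is a placeholder, and frame sampling uses `np.linspace` for brevity instead of the sample's stride-based indexing:

```python
import cv2
import numpy as np
import openvino_genai
from openvino import Tensor

# Decode the video and keep 8 frames evenly spaced across its length.
cap = cv2.VideoCapture("sample_demo_1.mp4")
frames = []
while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break
    frames.append(frame)
cap.release()
indices = np.linspace(0, len(frames) - 1, num=8).astype(int)
video = Tensor(np.array([frames[i] for i in indices]))  # shape [8, H, W, 3], u8

pipe = openvino_genai.VLMPipeline("./LLaVA-NeXT-Video-7B-hf/", "CPU")
config = openvino_genai.GenerationConfig()
config.max_new_tokens = 100
# Ask a single question about the video (greedy decoding, up to 100 new tokens).
print(pipe.generate("Describe this video.", videos=[video], generation_config=config))
```

An optional third CLI argument of the sample selects the device (default is CPU): `python video_to_text_chat.py ./LLaVA-NeXT-Video-7B-hf/ sample_demo_1.mp4 GPU`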

Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. # TODO: examples of larger models
Modify the source code to change the device for inference to the GPU.

See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models.

## Run benchmark:

```sh
100 changes: 100 additions & 0 deletions samples/python/visual_language_chat/video_to_text_chat.py
@@ -0,0 +1,100 @@
#!/usr/bin/env python3
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0


import argparse
import numpy as np
import cv2
import openvino_genai
from openvino import Tensor
from pathlib import Path


def streamer(subword: str) -> bool:
    '''
    Args:
        subword: sub-word of the generated text.
    Returns: flag indicating whether generation should be stopped.
    '''
    print(subword, end='', flush=True)

    # No value is returned as in this example we don't want to stop the generation in this method.
    # "return None" will be treated the same as "return openvino_genai.StreamingStatus.RUNNING".


def read_video(path: str, num_frames: int = 8) -> Tensor:
    '''
    Args:
        path: The path to the video.
        num_frames: Number of frames sampled from the video.
    Returns: the ov.Tensor containing the video.
    '''
    cap = cv2.VideoCapture(path)

    frames = []

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        frames.append(np.array(frame))

    # Keep num_frames frames evenly spaced across the whole video.
    indices = np.arange(0, len(frames), len(frames) / num_frames).astype(int)
    frames = [frames[i] for i in indices]

    # Stack the frames into a single [num_frames, H, W, 3] u8 array before wrapping in a Tensor.
    return Tensor(np.array(frames))


def read_videos(path: str) -> list[Tensor]:
    entry = Path(path)
    if entry.is_dir():
        return [read_video(str(file)) for file in sorted(entry.iterdir())]
    return [read_video(path)]


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('model_dir', help="Path to the model directory")
    parser.add_argument('video_dir', help="Path to a video file or a directory with videos.")
    parser.add_argument('device', nargs='?', default='CPU', help="Device to run the model on (default: CPU)")
    args = parser.parse_args()

    videos = read_videos(args.video_dir)

    # GPU and NPU can be used as well.
    # Note: If NPU is selected, only the language model will be run on the NPU.
    enable_compile_cache = dict()
    if args.device == "GPU":
        # Cache compiled models on disk for GPU to save time on the next run.
        # It's not beneficial for CPU.
        enable_compile_cache["CACHE_DIR"] = "vlm_cache"

    pipe = openvino_genai.VLMPipeline(args.model_dir, args.device, **enable_compile_cache)

    config = openvino_genai.GenerationConfig()
    config.max_new_tokens = 100

    pipe.start_chat()
    prompt = input('question:\n')
    # The first prompt is sent together with the video(s); follow-up turns reuse the chat history.
    pipe.generate(prompt, videos=videos, generation_config=config, streamer=streamer)

    while True:
        try:
            prompt = input("\n----------\n"
                           "question:\n")
        except EOFError:
            break
        pipe.generate(prompt, generation_config=config, streamer=streamer)
    pipe.finish_chat()


if __name__ == '__main__':
    main()