From ae099ec8c28aa90eb1a5877b42e6835ad7a73eb9 Mon Sep 17 00:00:00 2001
From: tc-mb <caitianchi@modelbest.cn>
Date: Tue, 14 Oct 2025 14:47:58 +0800
Subject: [PATCH 01/13] add video

Signed-off-by: tc-mb <caitianchi@modelbest.cn>
---
 common/arg.cpp            |   7 +
 common/common.h           |   1 +
 tools/mtmd/CMakeLists.txt |  20 +++
 tools/mtmd/clip.cpp       | 116 +++++++++--------
 tools/mtmd/clip.h         |   1 +
 tools/mtmd/mtmd-cli.cpp   |  90 +++++++++++--
 tools/mtmd/mtmd-video.cpp | 261 ++++++++++++++++++++++++++++++++++++++
 tools/mtmd/mtmd-video.h   |  53 ++++++++
 tools/mtmd/mtmd.cpp       |   5 +
 tools/mtmd/mtmd.h         |   4 +
 10 files changed, 493 insertions(+), 65 deletions(-)
 create mode 100644 tools/mtmd/mtmd-video.cpp
 create mode 100644 tools/mtmd/mtmd-video.h

diff --git a/common/arg.cpp b/common/arg.cpp
index 2b5e509d1d538..5e1714da71a82 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2514,6 +2514,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.image.emplace_back(value);
         }
     ).set_examples({LLAMA_EXAMPLE_MTMD}));
+    add_opt(common_arg(
+        {"--video"}, "PATH",
+        "path to a video file (requires FFmpeg at build time) or a directory of frames; can be repeated.\n",
+        [](common_params & params, const std::string & value) {
+            params.video.emplace_back(value);
+        }
+    ).set_examples({LLAMA_EXAMPLE_MTMD}));
     if (llama_supports_rpc()) {
         add_opt(common_arg(
             {"--rpc"}, "SERVERS",
diff --git a/common/common.h b/common/common.h
index 83d44dbaa70cc..6b1bb19d7e49c 100644
--- a/common/common.h
+++ b/common/common.h
@@ -405,6 +405,7 @@ struct common_params {
     bool mmproj_use_gpu = true;     // use GPU for multimodal model
     bool no_mmproj = false;         // explicitly disable multimodal model
     std::vector<std::string> image; // path to image file(s)
+    std::vector<std::string> video; // path to video file(s) or frame directories
 
     // finetune
     struct lr_opt lr;
diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt
index 2381012a0d02f..e135d4727f0a4 100644
--- a/tools/mtmd/CMakeLists.txt
+++ b/tools/mtmd/CMakeLists.txt
@@ -5,12 +5,14 @@ find_package(Threads REQUIRED)
 add_library(mtmd
             mtmd.cpp
             mtmd-audio.cpp
+            mtmd-video.cpp
             mtmd.h
             clip.cpp
             clip.h
             clip-impl.h
             mtmd-helper.cpp
             mtmd-helper.h
+            mtmd-video.h
             )
 
 target_link_libraries     (mtmd PUBLIC ggml llama)
@@ -20,6 +22,23 @@ target_include_directories(mtmd PRIVATE ../..)
 target_include_directories(mtmd PRIVATE ../../vendor)
 target_compile_features   (mtmd PRIVATE cxx_std_17)
 
+# Optional FFmpeg support for video decoding
+option(MTMD_WITH_FFMPEG "Enable FFmpeg-based video decoding in mtmd-video" OFF)
+if (MTMD_WITH_FFMPEG)
+    find_package(PkgConfig QUIET)
+    if (PKG_CONFIG_FOUND)
+        pkg_check_modules(FFMPEG QUIET IMPORTED_TARGET libavformat libavcodec libswscale libavutil)
+        if (FFMPEG_FOUND)
+            target_link_libraries(mtmd PRIVATE PkgConfig::FFMPEG)
+            target_compile_definitions(mtmd PRIVATE MTMD_WITH_FFMPEG)
+        else()
+            message(WARNING "FFmpeg not found via pkg-config; MTMD_WITH_FFMPEG disabled")
+        endif()
+    else()
+        message(WARNING "pkg-config not found; MTMD_WITH_FFMPEG disabled")
+    endif()
+endif()
+
 if (BUILD_SHARED_LIBS)
     set_target_properties     (mtmd PROPERTIES POSITION_INDEPENDENT_CODE ON)
     target_compile_definitions(mtmd PRIVATE LLAMA_BUILD)
@@ -29,6 +48,7 @@ endif()
 set(MTMD_PUBLIC_HEADERS
     ${CMAKE_CURRENT_SOURCE_DIR}/mtmd.h
     ${CMAKE_CURRENT_SOURCE_DIR}/mtmd-helper.h
+    ${CMAKE_CURRENT_SOURCE_DIR}/mtmd-video.h
     )
 
 set_target_properties(mtmd
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 61420193daef0..0d25aeaf3d29a 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -201,6 +201,7 @@ struct clip_hparams {
     // legacy
     bool has_llava_projector = false;
     int minicpmv_version = 0;
+    int minicpmv_max_slice_nums = 9;
     int32_t minicpmv_query_num = 0;         // MiniCPM-V query number
 };
 
@@ -3260,16 +3261,67 @@ struct llava_uhd {
         const bool has_slices    = original_size.width > slice_size || original_size.height > slice_size;
         const bool has_pinpoints = !ctx->model.hparams.image_res_candidates.empty();
 
-        if (!has_slices) {
-            // skip slicing logic
-            res.overview_size = clip_image_size{slice_size, slice_size};
-            res.refined_size  = clip_image_size{0, 0};
-            res.grid_size     = clip_image_size{0, 0};
+        if (clip_is_minicpmv(ctx)) {
+            auto best_size    = get_best_resize(original_size, slice_size, patch_size, !has_slices);
+            res.overview_size = best_size;
+
+            {
+                const int max_slice_nums = ctx->model.hparams.minicpmv_max_slice_nums;
+                const float log_ratio = log((float)original_width / original_height);
+                const float ratio = (float)original_width * original_height / (slice_size * slice_size);
+                const int multiple = fmin(ceil(ratio), max_slice_nums);
+
+                auto best_grid   = get_best_grid(max_slice_nums, multiple, log_ratio);
+                auto refine_size = get_refine_size(original_size, best_grid, slice_size, patch_size, true);
+                res.grid_size    = best_grid;
+                res.refined_size = refine_size;
+
+                LOG_DBG("%s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d, grid size: %d x %d\n",
+                        __func__, original_width, original_height,
+                        res.overview_size.width, res.overview_size.height,
+                        res.refined_size.width, res.refined_size.height,
+                        res.grid_size.width, res.grid_size.height);
+
+                if (!has_slices || max_slice_nums == 0) {
+                    return res;
+                }
+
+                int width  = refine_size.width;
+                int height = refine_size.height;
+                int grid_x = int(width  / best_grid.width);
+                int grid_y = int(height / best_grid.height);
+                for (int patches_y = 0,                    ic = 0;
+                        patches_y < refine_size.height && ic < best_grid.height;
+                        patches_y += grid_y,              ic += 1) {
+                    for (int patches_x = 0,                   jc = 0;
+                            patches_x < refine_size.width && jc < best_grid.width;
+                            patches_x += grid_x,             jc += 1) {
+                        slice_coordinates slice;
+                        slice.x = patches_x;
+                        slice.y = patches_y;
+                        slice.size.width  = grid_x;
+                        slice.size.height = grid_y;
+                        res.slices.push_back(slice);
+                        LOG_DBG("%s: slice %d: x=%d, y=%d, size=%dx%d\n",
+                                __func__, (int)res.slices.size() - 1,
+                                slice.x, slice.y, slice.size.width, slice.size.height);
+                    }
+                }
+            }
 
             return res;
         }
+        else {
+            if (!has_slices) {
+                // skip slicing logic
+                res.overview_size = clip_image_size{slice_size, slice_size};
+                res.refined_size  = clip_image_size{0, 0};
+                res.grid_size     = clip_image_size{0, 0};
 
-        if (has_pinpoints) {
+                return res;
+            }
+
+            if (has_pinpoints) {
             // has pinpoints, use them to calculate the grid size (e.g. llava-1.6)
             auto refine_size = llava_uhd::select_best_resolution(
                 original_size,
@@ -3305,53 +3357,7 @@ struct llava_uhd {
 
             return res;
         }
-
-        // no pinpoints, dynamically calculate the grid size (e.g. minicpmv)
-
-        auto best_size    = get_best_resize(original_size, slice_size, patch_size, !has_slices);
-        res.overview_size = best_size;
-
-        {
-            const int max_slice_nums = 9; // TODO: this is only used by minicpmv, maybe remove it
-            const float log_ratio = log((float)original_width / original_height);
-            const float ratio = (float)original_width * original_height / (slice_size * slice_size);
-            const int multiple = fmin(ceil(ratio), max_slice_nums);
-
-            auto best_grid   = get_best_grid(max_slice_nums, multiple, log_ratio);
-            auto refine_size = get_refine_size(original_size, best_grid, slice_size, patch_size, true);
-            res.grid_size    = best_grid;
-            res.refined_size = refine_size;
-
-            LOG_DBG("%s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d, grid size: %d x %d\n",
-                    __func__, original_width, original_height,
-                    res.overview_size.width, res.overview_size.height,
-                    res.refined_size.width, res.refined_size.height,
-                    res.grid_size.width, res.grid_size.height);
-
-            int width  = refine_size.width;
-            int height = refine_size.height;
-            int grid_x = int(width  / best_grid.width);
-            int grid_y = int(height / best_grid.height);
-            for (int patches_y = 0,                    ic = 0;
-                    patches_y < refine_size.height && ic < best_grid.height;
-                    patches_y += grid_y,              ic += 1) {
-                for (int patches_x = 0,                   jc = 0;
-                        patches_x < refine_size.width && jc < best_grid.width;
-                        patches_x += grid_x,             jc += 1) {
-                    slice_coordinates slice;
-                    slice.x = patches_x;
-                    slice.y = patches_y;
-                    slice.size.width  = grid_x;
-                    slice.size.height = grid_y;
-                    res.slices.push_back(slice);
-                    LOG_DBG("%s: slice %d: x=%d, y=%d, size=%dx%d\n",
-                            __func__, (int)res.slices.size() - 1,
-                            slice.x, slice.y, slice.size.width, slice.size.height);
-                }
-            }
         }
-
-        return res;
     }
 
     static std::vector<clip_image_u8_ptr> slice_image(const clip_image_u8 * img, const slice_instructions & inst) {
@@ -4390,6 +4396,12 @@ bool clip_has_whisper_encoder(const struct clip_ctx * ctx) {
         || ctx->proj_type() == PROJECTOR_TYPE_VOXTRAL;
 }
 
+void clip_set_minicpmv_max_slice_nums(struct clip_ctx * ctx, int n) {
+    if (ctx != nullptr) { // a null context is ignored; negative bounds are stored as 0
+        ctx->model.hparams.minicpmv_max_slice_nums = n < 0 ? 0 : n;
+    }
+}
+
 bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
     clip_image_f32 clip_img;
     clip_img.buf.resize(h * w * 3);
diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h
index 3387cdbd36955..c67bcc3a13c99 100644
--- a/tools/mtmd/clip.h
+++ b/tools/mtmd/clip.h
@@ -91,6 +91,7 @@ bool clip_image_encode      (struct clip_ctx * ctx, int n_threads, struct clip_i
 bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);
 
 int clip_is_minicpmv(const struct clip_ctx * ctx);
+void clip_set_minicpmv_max_slice_nums(struct clip_ctx * ctx, int n);
 bool clip_is_glm(const struct clip_ctx * ctx);
 bool clip_is_qwen2vl(const struct clip_ctx * ctx);
 bool clip_is_llava(const struct clip_ctx * ctx);
diff --git a/tools/mtmd/mtmd-cli.cpp b/tools/mtmd/mtmd-cli.cpp
index 5fde6ca0c32ae..bad1dca3ae546 100644
--- a/tools/mtmd/mtmd-cli.cpp
+++ b/tools/mtmd/mtmd-cli.cpp
@@ -6,12 +6,15 @@
 #include "ggml.h"
 #include "console.h"
 #include "chat.h"
+#include "clip.h"
 #include "mtmd.h"
 #include "mtmd-helper.h"
+#include "mtmd-video.h"
 
 #include <vector>
 #include <limits.h>
 #include <cinttypes>
+#include <cstdlib>
 
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
 #include <signal.h>
@@ -158,6 +161,23 @@ struct mtmd_cli_context {
         bitmaps.entries.push_back(std::move(bmp));
         return true;
     }
+
+    // Load multiple frames from a directory as a "video" (sequence of images)
+    // Returns number of frames appended
+    size_t load_video_dir(const std::string & dir, int max_frames = 32, int stride = 1, bool recursive = false) {
+        mtmd_video::LoadVideoOptions opts;
+        opts.max_frames = max_frames;
+        opts.stride     = stride;
+        opts.recursive  = recursive;
+        return mtmd_video::append_frames_from_dir(ctx_vision.get(), dir, bitmaps, opts);
+    }
+
+    size_t load_video_path(const std::string & path, int max_frames = 32, int stride = 1) {
+        mtmd_video::LoadVideoOptions opts;
+        opts.max_frames = max_frames;
+        opts.stride     = stride;
+        return mtmd_video::append_frames_from_path(ctx_vision.get(), path, bitmaps, opts);
+    }
 };
 
 static int generate_response(mtmd_cli_context & ctx, int n_predict) {
@@ -266,7 +286,7 @@ int main(int argc, char ** argv) {
     mtmd_cli_context ctx(params);
     LOG("%s: loading model: %s\n", __func__, params.model.path.c_str());
 
-    bool is_single_turn = !params.prompt.empty() && !params.image.empty();
+    bool is_single_turn = !params.prompt.empty() && (!params.image.empty() || !params.video.empty());
 
     int n_predict = params.n_predict < 0 ? INT_MAX : params.n_predict;
 
@@ -290,19 +310,38 @@ int main(int argc, char ** argv) {
 
     if (is_single_turn) {
         g_is_generating = true;
-        if (params.prompt.find(mtmd_default_marker()) == std::string::npos) {
-            for (size_t i = 0; i < params.image.size(); i++) {
-                params.prompt += mtmd_default_marker();
-            }
-        }
-        common_chat_msg msg;
-        msg.role = "user";
-        msg.content = params.prompt;
+
+        // 1) load all media first
+        size_t n_loaded_media = 0;
         for (const auto & image : params.image) {
             if (!ctx.load_media(image)) {
                 return 1; // error is already printed by libmtmd
             }
+            n_loaded_media += 1;
+        }
+        for (const auto & vpath : params.video) {
+            // for video understanding: disable UHD slicing (overview only)
+            mtmd_set_minicpmv_max_slice_nums(ctx.ctx_vision.get(), 0);
+            size_t n = ctx.load_video_path(vpath, /*max_frames*/3, /*stride*/1);
+            if (n == 0) {
+                LOG_ERR("Unable to load video frames from %s\n", vpath.c_str());
+                return 1;
+            }
+            n_loaded_media += n;
+        }
+
+        // 2) build prompt content with correct number of markers
+        std::string prompt_content = params.prompt;
+        if (prompt_content.find(mtmd_default_marker()) == std::string::npos) {
+            for (size_t i = 0; i < n_loaded_media; i++) {
+                prompt_content += mtmd_default_marker();
+            }
         }
+
+        // 3) run
+        common_chat_msg msg;
+        msg.role = "user";
+        msg.content = prompt_content;
         if (eval_message(ctx, msg, true)) {
             return 1;
         }
@@ -318,6 +357,9 @@ int main(int argc, char ** argv) {
         if (mtmd_support_audio(ctx.ctx_vision.get())) {
             LOG("\n   /audio <path>    load an audio");
         }
+        if (mtmd_support_vision(ctx.ctx_vision.get())) {
+            LOG("\n   /video <dir>     load frames from a directory as a video");
+        }
         LOG("\n   /clear           clear the chat history");
         LOG("\n   /quit or /exit   exit the program");
         LOG("\n");
@@ -349,15 +391,37 @@ int main(int argc, char ** argv) {
             g_is_generating = true;
             bool is_image = line == "/image" || line.find("/image ") == 0;
             bool is_audio = line == "/audio" || line.find("/audio ") == 0;
-            if (is_image || is_audio) {
+            bool is_video = line == "/video" || line.find("/video ") == 0;
+            if (is_image || is_audio || is_video) {
                 if (line.size() < 8) {
                     LOG_ERR("ERR: Missing media filename\n");
                     continue;
                 }
                 std::string media_path = line.substr(7);
-                if (ctx.load_media(media_path)) {
-                    LOG("%s %s loaded\n", media_path.c_str(), is_image ? "image" : "audio");
-                    content += mtmd_default_marker();
+                if (is_video) {
+                    // parse optional args: "/video <dir> [max_frames] [stride]"
+                    // simple split by spaces
+                    std::vector<std::string> parts = string_split(media_path, " ");
+                    std::string dir = parts.size() > 0 ? parts[0] : media_path;
+                    int max_frames = 32;
+                    int stride = 1;
+                    if (parts.size() > 1) max_frames = std::max(1, atoi(parts[1].c_str()));
+                    if (parts.size() > 2) stride     = std::max(1, atoi(parts[2].c_str()));
+                    size_t n = ctx.load_video_path(dir, max_frames, stride);
+                    if (n > 0) {
+                        LOG("%s video loaded with %zu frames\n", dir.c_str(), n);
+                        // add one marker per frame to match mtmd_tokenize expectations
+                        for (size_t i = 0; i < n; ++i) {
+                            content += mtmd_default_marker();
+                        }
+                    } else {
+                        LOG_ERR("ERR: failed to load video frames from %s\n", dir.c_str());
+                    }
+                } else {
+                    if (ctx.load_media(media_path)) {
+                        LOG("%s %s loaded\n", media_path.c_str(), is_image ? "image" : "audio");
+                        content += mtmd_default_marker();
+                    }
                 }
                 // else, error is already printed by libmtmd
                 continue;
diff --git a/tools/mtmd/mtmd-video.cpp b/tools/mtmd/mtmd-video.cpp
new file mode 100644
index 0000000000000..43f2c3b0a381a
--- /dev/null
+++ b/tools/mtmd/mtmd-video.cpp
@@ -0,0 +1,261 @@
+#include "mtmd-video.h"
+#include "mtmd-helper.h"
+#include "clip.h"
+
+#include <algorithm>
+#include <string>
+#include <vector>
+#include <cstdio>
+#include <cerrno>
+#include <cstring>
+#include <memory>
+
+#if defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+#   define NOMINMAX
+#endif
+#include <windows.h>
+#else
+#include <dirent.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#endif
+
+namespace {
+
+static bool has_image_ext(const std::string & name) {
+    // compare the text after the last '.' so "clip.png.txt" or "a.jpg.d/x" are not taken as images
+    const size_t dot = name.find_last_of('.');
+    std::string ext = dot == std::string::npos ? std::string() : name.substr(dot + 1);
+    std::transform(ext.begin(), ext.end(), ext.begin(), [](unsigned char c){ return (char)std::tolower(c); });
+    for (const char * known : { "jpg", "jpeg", "png", "bmp", "gif", "webp" }) {
+        if (ext == known) return true;
+    }
+    return false;
+}
+
+static bool is_dir(const std::string & path) {
+#if defined(_WIN32)
+    const DWORD attributes = GetFileAttributesA(path.c_str()); // INVALID_FILE_ATTRIBUTES when the query fails
+    return attributes != INVALID_FILE_ATTRIBUTES && (attributes & FILE_ATTRIBUTE_DIRECTORY) != 0;
+#else
+    struct stat info;
+    const bool found = stat(path.c_str(), &info) == 0; // a path that cannot be stat'ed is not a directory
+    return found && S_ISDIR(info.st_mode);
+#endif
+}
+
+static void list_files(const std::string & dir, std::vector<std::string> & out, bool recursive) { // append paths of non-directory entries of dir to out, in enumeration order
+#if defined(_WIN32)
+    std::string pattern = dir;
+    if (!pattern.empty() && pattern.back() != '/' && pattern.back() != '\\') pattern += "\\"; // add a separator unless dir already ends with one
+    pattern += "*"; // wildcard so that every entry of dir is enumerated
+    WIN32_FIND_DATAA ffd;
+    HANDLE hFind = FindFirstFileA(pattern.c_str(), &ffd);
+    if (hFind == INVALID_HANDLE_VALUE) return; // enumeration could not start: out is left unchanged
+    do {
+        std::string name = ffd.cFileName;
+        if (name == "." || name == "..") continue; // skip the self/parent entries
+        std::string path = dir;
+        if (!path.empty() && path.back() != '/' && path.back() != '\\') path += "\\";
+        path += name;
+        if (ffd.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) {
+            if (recursive) list_files(path, out, recursive); // subdirectories are descended into only when recursive is set
+        } else {
+            out.push_back(path);
+        }
+    } while (FindNextFileA(hFind, &ffd) != 0);
+    FindClose(hFind);
+#else
+    DIR * dp = opendir(dir.c_str());
+    if (!dp) return; // directory could not be opened: out is left unchanged
+    struct dirent * de;
+    while ((de = readdir(dp)) != nullptr) {
+        std::string name = de->d_name;
+        if (name == "." || name == "..") continue; // skip the self/parent entries
+        std::string path = dir + "/" + name;
+        if (is_dir(path)) {
+            if (recursive) list_files(path, out, recursive); // subdirectories are descended into only when recursive is set
+        } else {
+            out.push_back(path); // every non-directory entry is reported; no extension filtering happens here
+        }
+    }
+    closedir(dp);
+#endif
+}
+
+} // namespace
+
+namespace mtmd_video {
+
+bool load_frames_from_dir(mtmd_context * ctx,
+                          const std::string & dir_path,
+                          std::vector<mtmd::bitmap> & out_frames,
+                          const LoadVideoOptions & opts) {
+    if (!ctx || dir_path.empty() || !is_dir(dir_path)) {
+        return false;
+    }
+    // note: hparam-based control is applied inside clip.cpp; nothing to set globally here
+
+    std::vector<std::string> files;
+    list_files(dir_path, files, opts.recursive);
+    std::sort(files.begin(), files.end()); // frames are taken in lexicographic path order
+
+    const size_t stride = (size_t) std::max(1, opts.stride);
+    size_t n_images = 0; // image files seen so far: the stride counts these, not every directory entry
+    int loaded = 0;
+    for (const std::string & f : files) {
+        if (!has_image_ext(f)) continue;        // filter first so stray non-image files cannot skew sampling
+        if (n_images++ % stride != 0) continue; // keep every stride-th image, starting with the first
+        mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(ctx, f.c_str()));
+        if (!bmp.ptr) continue; // unreadable images are skipped rather than failing the whole load
+        out_frames.push_back(std::move(bmp));
+        loaded++;
+        if (opts.max_frames > 0 && loaded >= opts.max_frames) break;
+    }
+    return loaded > 0;
+}
+
+size_t append_frames_from_dir(mtmd_context * ctx,
+                              const std::string & dir_path,
+                              mtmd::bitmaps & dst,
+                              const LoadVideoOptions & opts) {
+    // stage the frames locally so dst is only touched once loading has succeeded
+    std::vector<mtmd::bitmap> loaded;
+    const bool ok = load_frames_from_dir(ctx, dir_path, loaded, opts);
+    if (!ok) return 0;
+    const size_t n_prev = dst.entries.size();
+    for (mtmd::bitmap & bmp : loaded) dst.entries.push_back(std::move(bmp));
+    return dst.entries.size() - n_prev;
+}
+
+// --- FFmpeg-based file decoding (optional) ---
+
+#ifdef MTMD_WITH_FFMPEG
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+extern "C" {
+#include <libavformat/avformat.h>
+#include <libavcodec/avcodec.h>
+#include <libswscale/swscale.h>
+}
+#pragma GCC diagnostic pop
+
+struct DecodedFrameRGBA {
+    int width  = 0;
+    int height = 0;
+    std::vector<unsigned char> rgba; // size = width * height * 4
+};
+
+static bool decode_video_ffmpeg_to_rgba(const std::string & file,
+                                        std::vector<DecodedFrameRGBA> & frames,
+                                        int max_frames,
+                                        int stride) {
+    AVFormatContext * fmt = nullptr;
+    if (avformat_open_input(&fmt, file.c_str(), nullptr, nullptr) < 0) return false;
+    std::unique_ptr<AVFormatContext, void(*)(AVFormatContext*)> fmt_guard(fmt, [](AVFormatContext *f){ if (f) avformat_close_input(&f); });
+    if (avformat_find_stream_info(fmt, nullptr) < 0) return false;
+    int vstream = av_find_best_stream(fmt, AVMEDIA_TYPE_VIDEO, -1, -1, nullptr, 0);
+    if (vstream < 0) return false;
+    AVStream * st = fmt->streams[vstream];
+    const AVCodec * dec = avcodec_find_decoder(st->codecpar->codec_id);
+    if (!dec) return false;
+    AVCodecContext * ctx = avcodec_alloc_context3(dec);
+    if (!ctx) return false;
+    std::unique_ptr<AVCodecContext, void(*)(AVCodecContext*)> ctx_guard(ctx, [](AVCodecContext *c){ if (c) avcodec_free_context(&c); });
+    if (avcodec_parameters_to_context(ctx, st->codecpar) < 0) return false;
+    if (avcodec_open2(ctx, dec, nullptr) < 0) return false;
+
+    AVFrame * frame = av_frame_alloc();
+    AVPacket * pkt  = av_packet_alloc();
+    std::unique_ptr<AVFrame, void(*)(AVFrame*)> frame_guard(frame, [](AVFrame *f){ if (f) av_frame_free(&f); });
+    std::unique_ptr<AVPacket, void(*)(AVPacket*)> pkt_guard(pkt, [](AVPacket *p){ if (p) av_packet_free(&p); });
+    if (!frame || !pkt) return false; // allocation failed; the guards release whichever one succeeded
+
+    SwsContext * sws = nullptr; // (re)created on demand below; released by sws_guard on every exit path
+    std::unique_ptr<SwsContext *, void(*)(SwsContext **)> sws_guard(&sws, [](SwsContext ** s){ sws_freeContext(*s); });
+    int idx = 0;
+    int taken = 0;
+    bool eof = false;
+    // read packets until EOF, then send one null packet to drain the frames still buffered in the decoder
+    while (!eof && !(max_frames > 0 && taken >= max_frames)) {
+        eof = av_read_frame(fmt, pkt) < 0;
+        if (!eof && pkt->stream_index != vstream) { av_packet_unref(pkt); continue; }
+        const int sent = avcodec_send_packet(ctx, eof ? nullptr : pkt);
+        av_packet_unref(pkt);
+        if (sent < 0) break;
+        while (avcodec_receive_frame(ctx, frame) == 0) {
+            if (stride > 1 && (idx++ % stride != 0)) continue;
+            // the cached context is rebuilt if the stream changes size or pixel format mid-way
+            sws = sws_getCachedContext(sws, frame->width, frame->height, (AVPixelFormat)frame->format,
+                                       frame->width, frame->height, AV_PIX_FMT_RGBA,
+                                       SWS_BILINEAR, nullptr, nullptr, nullptr);
+            if (!sws) return false;
+            DecodedFrameRGBA out { frame->width, frame->height, {} };
+            out.rgba.resize((size_t)frame->width * frame->height * 4);
+            uint8_t * dst_data[4] = { out.rgba.data(), nullptr, nullptr, nullptr };
+            int dst_linesize[4] = { frame->width * 4, 0, 0, 0 };
+            sws_scale(sws, frame->data, frame->linesize, 0, frame->height, dst_data, dst_linesize);
+            frames.push_back(std::move(out));
+            taken++;
+            if (max_frames > 0 && taken >= max_frames) break;
+        }
+    }
+    return taken > 0;
+}
+
+bool load_frames_from_file(mtmd_context * ctx,
+                           const std::string & file_path,
+                           std::vector<mtmd::bitmap> & out_frames,
+                           const LoadVideoOptions & opts) {
+    if (!ctx) return false;
+    std::vector<DecodedFrameRGBA> decoded;
+    if (!decode_video_ffmpeg_to_rgba(file_path, decoded, opts.max_frames, std::max(1, opts.stride))) {
+        return false;
+    }
+    size_t n_added = 0;             // success is judged on frames added by this call only
+    std::vector<unsigned char> rgb; // packed RGB scratch buffer, reused across frames
+    for (const auto & fr : decoded) {
+        const size_t n_px = (size_t)fr.width * (size_t)fr.height; // size_t: width * height may not fit in int
+        rgb.resize(n_px * 3);
+        const unsigned char * src = fr.rgba.data();
+        for (size_t i = 0; i < n_px; ++i) {
+            // copy R, G, B and drop the alpha byte
+            rgb[3*i + 0] = src[4*i + 0];
+            rgb[3*i + 1] = src[4*i + 1];
+            rgb[3*i + 2] = src[4*i + 2];
+        }
+        mtmd::bitmap bmp(mtmd_bitmap_init((uint32_t)fr.width, (uint32_t)fr.height, rgb.data()));
+        if (!bmp.ptr) continue;
+        out_frames.push_back(std::move(bmp));
+        n_added++;
+    }
+    return n_added > 0;
+}
+#else
+bool load_frames_from_file(mtmd_context * /*ctx*/,
+                           const std::string & /*file_path*/,
+                           std::vector<mtmd::bitmap> & /*out_frames*/,
+                           const LoadVideoOptions & /*opts*/) {
+    return false; // stub for builds without MTMD_WITH_FFMPEG: video files cannot be decoded, so this always fails
+}
+#endif
+
+size_t append_frames_from_path(mtmd_context * ctx,
+                               const std::string & path,
+                               mtmd::bitmaps & dst,
+                               const LoadVideoOptions & opts) {
+    // directories are read as image sequences; anything else is handed to the video-file loader
+    if (is_dir(path)) return append_frames_from_dir(ctx, path, dst, opts);
+
+    std::vector<mtmd::bitmap> decoded;
+    const bool ok = load_frames_from_file(ctx, path, decoded, opts);
+    if (!ok) return 0;
+    const size_t n_prev = dst.entries.size();
+    for (mtmd::bitmap & bmp : decoded) dst.entries.push_back(std::move(bmp));
+    return dst.entries.size() - n_prev;
+}
+
+} // namespace mtmd_video
+
+
diff --git a/tools/mtmd/mtmd-video.h b/tools/mtmd/mtmd-video.h
new file mode 100644
index 0000000000000..e36ef38f40155
--- /dev/null
+++ b/tools/mtmd/mtmd-video.h
@@ -0,0 +1,53 @@
+// mtmd-video helpers (experimental)
+// Minimal C++ helpers to load video frames from a directory of image files or,
+// with MTMD_WITH_FFMPEG, from a video file, and append them to mtmd::bitmaps for downstream tokenization.
+
+#ifndef MTMD_VIDEO_H
+#define MTMD_VIDEO_H
+
+#include "mtmd.h"
+
+#include <string>
+#include <vector>
+
+namespace mtmd_video {
+
+struct LoadVideoOptions {
+    int max_frames = 32;   // maximum frames to load (<= 0 means no limit)
+    int stride     = 1;    // take every N-th frame
+    bool recursive = false;// scan subdirectories
+};
+
+// Load frames from a directory of images (jpg/png/bmp/webp etc.)
+// Returns true on success (>=1 frame loaded), false otherwise.
+bool load_frames_from_dir(mtmd_context * ctx,
+                          const std::string & dir_path,
+                          std::vector<mtmd::bitmap> & out_frames,
+                          const LoadVideoOptions & opts = {});
+
+// Append frames loaded from a directory into mtmd::bitmaps container.
+// Returns number of frames appended (0 on failure).
+size_t append_frames_from_dir(mtmd_context * ctx,
+                              const std::string & dir_path,
+                              mtmd::bitmaps & dst,
+                              const LoadVideoOptions & opts = {});
+
+// Load frames from a video file via FFmpeg (mp4/mov/mkv/avi/webm...).
+// Returns true on success (>=1 frame loaded), false otherwise.
+bool load_frames_from_file(mtmd_context * ctx,
+                           const std::string & file_path,
+                           std::vector<mtmd::bitmap> & out_frames,
+                           const LoadVideoOptions & opts = {});
+
+// Append frames loaded from a file or directory (auto-detect).
+// Returns number of frames appended (0 on failure or unsupported input).
+size_t append_frames_from_path(mtmd_context * ctx,
+                               const std::string & path,
+                               mtmd::bitmaps & dst,
+                               const LoadVideoOptions & opts = {});
+
+} // namespace mtmd_video
+
+#endif // MTMD_VIDEO_H
+
+
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index cd022c5e245c0..6a186832742e5 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -833,6 +833,11 @@ int mtmd_get_audio_bitrate(mtmd_context * ctx) {
     return 16000; // 16kHz
 }
 
+void mtmd_set_minicpmv_max_slice_nums(mtmd_context * ctx, int n) {
+    const bool has_vision = ctx != nullptr && ctx->ctx_v != nullptr; // without a vision context there is nothing to configure
+    if (has_vision) clip_set_minicpmv_max_slice_nums(ctx->ctx_v, n);
+}
+
 //
 // public API functions
 //
diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h
index f4ea07d3ad521..daf675e50f0b6 100644
--- a/tools/mtmd/mtmd.h
+++ b/tools/mtmd/mtmd.h
@@ -112,6 +112,10 @@ MTMD_API bool mtmd_support_audio(mtmd_context * ctx);
 // return -1 if audio is not supported
 MTMD_API int mtmd_get_audio_bitrate(mtmd_context * ctx);
 
+// set MiniCPM-V UHD slicing upper bound (used when preprocessing images for MiniCPM-V)
+// values < 0 are clamped to 0; 0 disables slicing (only the overview image is used)
+MTMD_API void mtmd_set_minicpmv_max_slice_nums(mtmd_context * ctx, int n);
+
 // mtmd_bitmap
 //
 // if bitmap is image:

From d5b832defb4b84902cd9fa37dbb96173d4ac2392 Mon Sep 17 00:00:00 2001
From: qijia <2421653893@qq.com>
Date: Tue, 28 Oct 2025 18:15:54 +0800
Subject: [PATCH 02/13] feat: support minicpm-v video normal speed

---
 tools/mtmd/mtmd-cli.cpp   | 33 +++++-------------
 tools/mtmd/mtmd-video.cpp | 71 +++++++++++++++++++++++++++++++++++++--
 tools/mtmd/mtmd-video.h   | 12 +++++--
 3 files changed, 88 insertions(+), 28 deletions(-)

diff --git a/tools/mtmd/mtmd-cli.cpp b/tools/mtmd/mtmd-cli.cpp
index bad1dca3ae546..724dc48d38d9a 100644
--- a/tools/mtmd/mtmd-cli.cpp
+++ b/tools/mtmd/mtmd-cli.cpp
@@ -162,21 +162,10 @@ struct mtmd_cli_context {
         return true;
     }
 
-    // Load multiple frames from a directory as a "video" (sequence of images)
+    // Load multiple frames from a video file or a directory as a "video" (sequence of images)
     // Returns number of frames appended
-    size_t load_video_dir(const std::string & dir, int max_frames = 32, int stride = 1, bool recursive = false) {
-        mtmd_video::LoadVideoOptions opts;
-        opts.max_frames = max_frames;
-        opts.stride     = stride;
-        opts.recursive  = recursive;
-        return mtmd_video::append_frames_from_dir(ctx_vision.get(), dir, bitmaps, opts);
-    }
-
-    size_t load_video_path(const std::string & path, int max_frames = 32, int stride = 1) {
-        mtmd_video::LoadVideoOptions opts;
-        opts.max_frames = max_frames;
-        opts.stride     = stride;
-        return mtmd_video::append_frames_from_path(ctx_vision.get(), path, bitmaps, opts);
+    size_t load_video(const std::string & path) {
+        return mtmd_video::append_frames_from_path(ctx_vision.get(), path, bitmaps);
     }
 };
 
@@ -322,7 +311,7 @@ int main(int argc, char ** argv) {
         for (const auto & vpath : params.video) {
             // for video understanding: disable UHD slicing (overview only)
             mtmd_set_minicpmv_max_slice_nums(ctx.ctx_vision.get(), 0);
-            size_t n = ctx.load_video_path(vpath, /*max_frames*/3, /*stride*/1);
+            size_t n = ctx.load_video(vpath);
             if (n == 0) {
                 LOG_ERR("Unable to load video frames from %s\n", vpath.c_str());
                 return 1;
@@ -399,23 +388,19 @@ int main(int argc, char ** argv) {
                 }
                 std::string media_path = line.substr(7);
                 if (is_video) {
-                    // parse optional args: "/video <dir> [max_frames] [stride]"
+                    // parse optional args: "/video <file/dir path>"
                     // simple split by spaces
                     std::vector<std::string> parts = string_split(media_path, " ");
-                    std::string dir = parts.size() > 0 ? parts[0] : media_path;
-                    int max_frames = 32;
-                    int stride = 1;
-                    if (parts.size() > 1) max_frames = std::max(1, atoi(parts[1].c_str()));
-                    if (parts.size() > 2) stride     = std::max(1, atoi(parts[2].c_str()));
-                    size_t n = ctx.load_video_path(dir, max_frames, stride);
+                    std::string path = parts.size() > 0 ? parts[0] : media_path;
+                    size_t n = ctx.load_video(path);
                     if (n > 0) {
-                        LOG("%s video loaded with %zu frames\n", dir.c_str(), n);
+                        LOG("%s video loaded with %zu frames\n", path.c_str(), n);
                         // add one marker per frame to match mtmd_tokenize expectations
                         for (size_t i = 0; i < n; ++i) {
                             content += mtmd_default_marker();
                         }
                     } else {
-                        LOG_ERR("ERR: failed to load video frames from %s\n", dir.c_str());
+                        LOG_ERR("ERR: failed to load video frames from %s\n", path.c_str());
                     }
                 } else {
                     if (ctx.load_media(media_path)) {
diff --git a/tools/mtmd/mtmd-video.cpp b/tools/mtmd/mtmd-video.cpp
index 43f2c3b0a381a..fb23e358cc1b2 100644
--- a/tools/mtmd/mtmd-video.cpp
+++ b/tools/mtmd/mtmd-video.cpp
@@ -9,6 +9,7 @@
 #include <cerrno>
 #include <cstring>
 #include <memory>
+#include <cmath>
 
 #if defined(_WIN32)
 #define WIN32_LEAN_AND_MEAN
@@ -148,6 +149,44 @@ struct DecodedFrameRGBA {
     std::vector<unsigned char> rgba; // size = width * height * 4
 };
 
+bool get_video_info_ffmpeg(const std::string &file, VideoInfo &info) {
+    // Probes `file` with FFmpeg; fields it cannot determine keep the caller's values.
+    // Returns false if the file cannot be opened/probed or holds no video stream.
+    AVFormatContext * container = nullptr;
+    if (avformat_open_input(&container, file.c_str(), nullptr, nullptr) < 0) {
+        return false;
+    }
+
+    auto close_container = [](AVFormatContext * c) { if (c) { avformat_close_input(&c); } };
+    std::unique_ptr<AVFormatContext, void(*)(AVFormatContext*)> container_guard(container, close_container);
+
+    if (avformat_find_stream_info(container, nullptr) < 0) {
+        return false;
+    }
+
+    const int video_idx = av_find_best_stream(container, AVMEDIA_TYPE_VIDEO, -1, -1, nullptr, 0);
+    if (video_idx < 0) {
+        return false;
+    }
+    const AVStream * stream = container->streams[video_idx];
+
+    // frame rate: prefer avg_frame_rate, fall back to r_frame_rate
+    const AVRational avg_rate  = stream->avg_frame_rate;
+    const AVRational base_rate = stream->r_frame_rate;
+    if (avg_rate.num > 0 && avg_rate.den > 0) {
+        info.fps = av_q2d(avg_rate);
+    } else if (base_rate.num > 0 && base_rate.den > 0) {
+        info.fps = av_q2d(base_rate);
+    }
+    // frame count: use the stream's own count, otherwise estimate it as duration * fps
+    if (stream->nb_frames > 0) {
+        info.total_frames = stream->nb_frames;
+    } else if (container->duration > 0 && info.fps > 0.0) {
+        info.total_frames = std::llround((container->duration / (double)AV_TIME_BASE) * info.fps);
+    }
+    return true;
+}
+
 static bool decode_video_ffmpeg_to_rgba(const std::string & file,
                                         std::vector<DecodedFrameRGBA> & frames,
                                         int max_frames,
@@ -239,12 +278,40 @@ bool load_frames_from_file(mtmd_context * /*ctx*/,
                            const LoadVideoOptions & /*opts*/) {
     return false;
 }
+bool get_video_info_ffmpeg(const std::string &file, VideoInfo &info) {
+    return false;
+}
 #endif
 
 size_t append_frames_from_path(mtmd_context * ctx,
                                const std::string & path,
-                               mtmd::bitmaps & dst,
-                               const LoadVideoOptions & opts) {
+                               mtmd::bitmaps & dst) {
+    mtmd_video::LoadVideoOptions opts;
+    opts.max_frames = 32;
+    opts.stride     = 1;
+    opts.recursive  = false;
+
+    auto info = mtmd_video::VideoInfo{};
+    if(is_dir(path)) {
+        info.fps = 1;
+        std::vector<std::string> files;
+        list_files(path, files, opts.recursive);
+        info.total_frames = files.size();
+    } else {
+        mtmd_video::get_video_info_ffmpeg(path, info);
+    }
+
+    // minicpm normal speed
+    const int32_t minicpmv_max_video_frames = 64;
+    opts.max_frames = minicpmv_max_video_frames;
+    if(info.total_frames > minicpmv_max_video_frames) {
+        // uniform sample
+        opts.stride = (int)std::ceil((double)info.total_frames / minicpmv_max_video_frames);
+    } else {
+        // 1 frame per second
+        opts.stride = (info.fps > 1.0) ? (int)std::ceil(info.fps) : 1;
+    }
+
     if (is_dir(path)) {
         return append_frames_from_dir(ctx, path, dst, opts);
     } else {
diff --git a/tools/mtmd/mtmd-video.h b/tools/mtmd/mtmd-video.h
index e36ef38f40155..26055511f27b1 100644
--- a/tools/mtmd/mtmd-video.h
+++ b/tools/mtmd/mtmd-video.h
@@ -18,6 +18,11 @@ struct LoadVideoOptions {
     bool recursive = false;// scan subdirectories
 };
 
+struct VideoInfo {
+    double fps = 0.0;           // frames per second
+    int64_t total_frames = 0;   // total number of frames
+};
+
 // Load frames from a directory of images (jpg/png/bmp/webp etc.)
 // Returns true on success (>=1 frame loaded), false otherwise.
 bool load_frames_from_dir(mtmd_context * ctx,
@@ -32,6 +37,10 @@ size_t append_frames_from_dir(mtmd_context * ctx,
                               mtmd::bitmaps & dst,
                               const LoadVideoOptions & opts = {});
 
+// get video info (fps, total frames) via FFmpeg (mp4/mov/mkv/avi/webm...).
+// Returns true on success, false otherwise.
+bool get_video_info_ffmpeg(const std::string &file, VideoInfo &info);
+
 // Load frames from a video file via FFmpeg (mp4/mov/mkv/avi/webm...).
 // Returns true on success (>=1 frame loaded), false otherwise.
 bool load_frames_from_file(mtmd_context * ctx,
@@ -43,8 +52,7 @@ bool load_frames_from_file(mtmd_context * ctx,
 // Returns number of frames appended (0 on failure or unsupported input).
 size_t append_frames_from_path(mtmd_context * ctx,
                                const std::string & path,
-                               mtmd::bitmaps & dst,
-                               const LoadVideoOptions & opts = {});
+                               mtmd::bitmaps & dst);
 
 } // namespace mtmd_video
 

From f09abc0182573c8aa06dfcf7e32cb8c0e74a3b45 Mon Sep 17 00:00:00 2001
From: qijia <2421653893@qq.com>
Date: Wed, 29 Oct 2025 10:46:34 +0800
Subject: [PATCH 03/13] fix: video normal speed uniform sample & check
 videoInfo valid

---
 tools/mtmd/mtmd-video.cpp | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/tools/mtmd/mtmd-video.cpp b/tools/mtmd/mtmd-video.cpp
index fb23e358cc1b2..976ef853b81c0 100644
--- a/tools/mtmd/mtmd-video.cpp
+++ b/tools/mtmd/mtmd-video.cpp
@@ -191,6 +191,7 @@ static bool decode_video_ffmpeg_to_rgba(const std::string & file,
                                         std::vector<DecodedFrameRGBA> & frames,
                                         int max_frames,
                                         int stride) {
+    if(stride <= 0 || max_frames <= 0) return false;
     AVFormatContext * fmt = nullptr;
     if (avformat_open_input(&fmt, file.c_str(), nullptr, nullptr) < 0) return false;
     std::unique_ptr<AVFormatContext, void(*)(AVFormatContext*)> fmt_guard(fmt, [](AVFormatContext *f){ if (f) avformat_close_input(&f); });
@@ -219,7 +220,7 @@ static bool decode_video_ffmpeg_to_rgba(const std::string & file,
         if (avcodec_send_packet(ctx, pkt) < 0) { av_packet_unref(pkt); break; }
         av_packet_unref(pkt);
         while (avcodec_receive_frame(ctx, frame) == 0) {
-            if (stride > 1 && (idx++ % stride != 0)) continue;
+            if (idx++ % stride != stride/2) continue;
             if (!sws) {
                 sws = sws_getContext(frame->width, frame->height, (AVPixelFormat)frame->format,
                                      frame->width, frame->height, AV_PIX_FMT_RGBA,
@@ -235,9 +236,9 @@ static bool decode_video_ffmpeg_to_rgba(const std::string & file,
             sws_scale(sws, frame->data, frame->linesize, 0, frame->height, dst_data, dst_linesize);
             frames.push_back(std::move(out));
             taken++;
-            if (max_frames > 0 && taken >= max_frames) break;
+            if (taken >= max_frames) break;
         }
-        if (max_frames > 0 && taken >= max_frames) break;
+        if (taken >= max_frames) break;
     }
     if (sws) sws_freeContext(sws);
     return taken > 0;
@@ -298,7 +299,9 @@ size_t append_frames_from_path(mtmd_context * ctx,
         list_files(path, files, opts.recursive);
         info.total_frames = files.size();
     } else {
-        mtmd_video::get_video_info_ffmpeg(path, info);
+        if(!mtmd_video::get_video_info_ffmpeg(path, info)) {
+            return 0;
+        }
     }
 
     // minicpm normal speed

From e3776869eee36ffb2d74d1ccb02052981120ab24 Mon Sep 17 00:00:00 2001
From: qijia <2421653893@qq.com>
Date: Wed, 29 Oct 2025 17:47:25 +0800
Subject: [PATCH 04/13] refactor: support video in media

---
 tools/mtmd/mtmd-cli.cpp    |  41 +-----
 tools/mtmd/mtmd-helper.cpp |   6 +
 tools/mtmd/mtmd-video.cpp  | 111 ++++++++-------
 tools/mtmd/mtmd-video.h    |  35 ++---
 tools/mtmd/mtmd.cpp        | 267 ++++++++++++++++++++++++-------------
 tools/mtmd/mtmd.h          |   6 +
 6 files changed, 262 insertions(+), 204 deletions(-)

diff --git a/tools/mtmd/mtmd-cli.cpp b/tools/mtmd/mtmd-cli.cpp
index 724dc48d38d9a..fb5b666da07b1 100644
--- a/tools/mtmd/mtmd-cli.cpp
+++ b/tools/mtmd/mtmd-cli.cpp
@@ -161,12 +161,6 @@ struct mtmd_cli_context {
         bitmaps.entries.push_back(std::move(bmp));
         return true;
     }
-
-    // Load multiple frames from a video file or a directory as a "video" (sequence of images)
-    // Returns number of frames appended
-    size_t load_video(const std::string & path) {
-        return mtmd_video::append_frames_from_path(ctx_vision.get(), path, bitmaps);
-    }
 };
 
 static int generate_response(mtmd_cli_context & ctx, int n_predict) {
@@ -309,14 +303,10 @@ int main(int argc, char ** argv) {
             n_loaded_media += 1;
         }
         for (const auto & vpath : params.video) {
-            // for video understanding: disable UHD slicing (overview only)
-            mtmd_set_minicpmv_max_slice_nums(ctx.ctx_vision.get(), 0);
-            size_t n = ctx.load_video(vpath);
-            if (n == 0) {
-                LOG_ERR("Unable to load video frames from %s\n", vpath.c_str());
-                return 1;
+            if (!ctx.load_media(vpath)) {
+                return 1; // error is already printed by libmtmd
             }
-            n_loaded_media += n;
+            n_loaded_media += 1;
         }
 
         // 2) build prompt content with correct number of markers
@@ -347,7 +337,7 @@ int main(int argc, char ** argv) {
             LOG("\n   /audio <path>    load an audio");
         }
         if (mtmd_support_vision(ctx.ctx_vision.get())) {
-            LOG("\n   /video <dir>     load frames from a directory as a video");
+            LOG("\n   /video <path>     load a video");
         }
         LOG("\n   /clear           clear the chat history");
         LOG("\n   /quit or /exit   exit the program");
@@ -387,26 +377,9 @@ int main(int argc, char ** argv) {
                     continue;
                 }
                 std::string media_path = line.substr(7);
-                if (is_video) {
-                    // parse optional args: "/video <file/dir path>"
-                    // simple split by spaces
-                    std::vector<std::string> parts = string_split(media_path, " ");
-                    std::string path = parts.size() > 0 ? parts[0] : media_path;
-                    size_t n = ctx.load_video(path);
-                    if (n > 0) {
-                        LOG("%s video loaded with %zu frames\n", path.c_str(), n);
-                        // add one marker per frame to match mtmd_tokenize expectations
-                        for (size_t i = 0; i < n; ++i) {
-                            content += mtmd_default_marker();
-                        }
-                    } else {
-                        LOG_ERR("ERR: failed to load video frames from %s\n", path.c_str());
-                    }
-                } else {
-                    if (ctx.load_media(media_path)) {
-                        LOG("%s %s loaded\n", media_path.c_str(), is_image ? "image" : "audio");
-                        content += mtmd_default_marker();
-                    }
+                if (ctx.load_media(media_path)) {
+                    LOG("%s %s loaded\n", media_path.c_str(), is_image ? "image" : (is_audio ? "audio" : "video"));
+                    content += mtmd_default_marker();
                 }
                 // else, error is already printed by libmtmd
                 continue;
diff --git a/tools/mtmd/mtmd-helper.cpp b/tools/mtmd/mtmd-helper.cpp
index 686f42f3960fe..46c85ecd3614d 100644
--- a/tools/mtmd/mtmd-helper.cpp
+++ b/tools/mtmd/mtmd-helper.cpp
@@ -11,6 +11,8 @@
 #include "mtmd-helper.h"
 #include "llama.h"
 
+#include "mtmd-video.h"
+
 #include <algorithm>
 #include <cinttypes>
 #include <vector>
@@ -437,6 +439,10 @@ mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigne
 }
 
 mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname) {
+    if(mtmd_video::is_video_file(fname)){
+        return mtmd_video::init_video_bitmap_from_path(ctx, fname);
+    }
+
     std::vector<unsigned char> buf;
     FILE * f = fopen(fname, "rb");
     if (!f) {
diff --git a/tools/mtmd/mtmd-video.cpp b/tools/mtmd/mtmd-video.cpp
index 976ef853b81c0..e9d6aea366653 100644
--- a/tools/mtmd/mtmd-video.cpp
+++ b/tools/mtmd/mtmd-video.cpp
@@ -90,12 +90,22 @@ static void list_files(const std::string & dir, std::vector<std::string> & out,
 
 namespace mtmd_video {
 
-bool load_frames_from_dir(mtmd_context * ctx,
+bool is_video_file(const std::string & path){
+    // Case-insensitive check that path ENDS with a known video extension; ".mp4" inside a directory name or file stem does not count.
+    auto lower = path;
+    std::transform(lower.begin(), lower.end(), lower.begin(), [](unsigned char c){ return (char)std::tolower(c); });
+    for (const std::string ext : {".mp4", ".mov", ".mkv", ".avi", ".webm"}) {
+        if (lower.size() >= ext.size() && lower.compare(lower.size() - ext.size(), ext.size(), ext) == 0) return true;
+    }
+    return false;
+}
+
+// untested
+static mtmd_bitmap* load_frames_from_dir(mtmd_context * ctx,
                           const std::string & dir_path,
-                          std::vector<mtmd::bitmap> & out_frames,
                           const LoadVideoOptions & opts) {
-    if (!ctx || dir_path.empty() || !is_dir(dir_path)) {
-        return false;
+    if (!ctx || dir_path.empty() || !is_dir(dir_path) || opts.max_frames < 1) {
+        return nullptr;
     }
     // note: hparam-based control is applied inside clip.cpp; nothing to set globally here
 
@@ -103,32 +113,35 @@ bool load_frames_from_dir(mtmd_context * ctx,
     list_files(dir_path, files, opts.recursive);
     std::sort(files.begin(), files.end());
 
-    int stride = std::max(1, opts.stride);
-    int loaded = 0;
+    auto stride = std::max(1u, opts.stride);
+    size_t loaded = 0;
+    unsigned char* dest = nullptr;
+    mtmd_bitmap* out_frames = nullptr;
+
+    uint32_t w=0, h=0;
     for (size_t i = 0; i < files.size(); i++) {
-        if ((int)i % stride != 0) continue;
+        if (i % stride != 0) continue;
         const std::string & f = files[i];
         if (!has_image_ext(f)) continue;
         mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(ctx, f.c_str()));
         if (!bmp.ptr) continue;
-        out_frames.push_back(std::move(bmp));
+        if(loaded==0){
+            w = bmp.nx();
+            h = bmp.ny();
+            out_frames = mtmd_bitmap_init_from_video(w, h, loaded, nullptr);
+            dest = mtmd_bitmap_get_data_mutable(out_frames);
+        }else if(bmp.nx() != w || bmp.ny() != h){
+            return nullptr; // all frames must have the same size
+        }
+        std::memcpy(dest,
+                    bmp.data(),
+                    bmp.n_bytes());
+        dest += bmp.n_bytes();
         loaded++;
-        if (opts.max_frames > 0 && loaded >= opts.max_frames) break;
-    }
-    return loaded > 0;
-}
-
-size_t append_frames_from_dir(mtmd_context * ctx,
-                              const std::string & dir_path,
-                              mtmd::bitmaps & dst,
-                              const LoadVideoOptions & opts) {
-    std::vector<mtmd::bitmap> frames;
-    if (!load_frames_from_dir(ctx, dir_path, frames, opts)) {
-        return 0;
+        if (loaded >= opts.max_frames) break;
     }
-    size_t before = dst.entries.size();
-    for (auto & f : frames) dst.entries.push_back(std::move(f));
-    return dst.entries.size() - before;
+    
+    return out_frames;
 }
 
 // --- FFmpeg-based file decoding (optional) ---
@@ -244,22 +257,26 @@ static bool decode_video_ffmpeg_to_rgba(const std::string & file,
     return taken > 0;
 }
 
-bool load_frames_from_file(mtmd_context * ctx,
+static mtmd_bitmap* load_frames_from_file(mtmd_context * ctx,
                            const std::string & file_path,
-                           std::vector<mtmd::bitmap> & out_frames,
                            const LoadVideoOptions & opts) {
-    if (!ctx) return false;
+    if (!ctx) return nullptr;
     std::vector<DecodedFrameRGBA> decoded;
-    if (!decode_video_ffmpeg_to_rgba(file_path, decoded, opts.max_frames, std::max(1, opts.stride))) {
-        return false;
+    if (!decode_video_ffmpeg_to_rgba(file_path, decoded, opts.max_frames, std::max(1u, opts.stride))) {
+        return nullptr;
     }
+    const size_t nframes = decoded.size();
+    if(nframes < 1){
+        return nullptr;
+    }
+    const int w = decoded[0].width;
+    const int h = decoded[0].height;
+    mtmd_bitmap* out_frames = mtmd_bitmap_init_from_video(uint32_t(w), uint32_t(h), uint32_t(nframes), nullptr);
+    unsigned char * dst = mtmd_bitmap_get_data_mutable(out_frames);
+
     for (auto & fr : decoded) {
-        const int w = fr.width;
-        const int h = fr.height;
-        std::vector<unsigned char> rgb;
-        rgb.resize((size_t)w * h * 3);
+        GGML_ASSERT(w == fr.width && h == fr.height);
         const unsigned char * src = fr.rgba.data();
-        unsigned char * dst = rgb.data();
         for (int i = 0; i < w * h; ++i) {
             dst[0] = src[0]; // R
             dst[1] = src[1]; // G
@@ -267,26 +284,24 @@ bool load_frames_from_file(mtmd_context * ctx,
             dst += 3;
             src += 4; // skip A
         }
-        mtmd::bitmap bmp(mtmd_bitmap_init((uint32_t)w, (uint32_t)h, rgb.data()));
-        if (bmp.ptr) out_frames.push_back(std::move(bmp));
     }
-    return !out_frames.empty();
+
+    return out_frames;
 }
 #else
-bool load_frames_from_file(mtmd_context * /*ctx*/,
+static mtmd_bitmap* load_frames_from_file(mtmd_context * /*ctx*/,
                            const std::string & /*file_path*/,
-                           std::vector<mtmd::bitmap> & /*out_frames*/,
                            const LoadVideoOptions & /*opts*/) {
-    return false;
+    return nullptr;
 }
 bool get_video_info_ffmpeg(const std::string &file, VideoInfo &info) {
+    LOG_ERR("FFmpeg support is not enabled in this build\n");
     return false;
 }
 #endif
 
-size_t append_frames_from_path(mtmd_context * ctx,
-                               const std::string & path,
-                               mtmd::bitmaps & dst) {
+mtmd_bitmap* init_video_bitmap_from_path(mtmd_context * ctx,
+                               const std::string & path) {
     mtmd_video::LoadVideoOptions opts;
     opts.max_frames = 32;
     opts.stride     = 1;
@@ -300,7 +315,7 @@ size_t append_frames_from_path(mtmd_context * ctx,
         info.total_frames = files.size();
     } else {
         if(!mtmd_video::get_video_info_ffmpeg(path, info)) {
-            return 0;
+            return nullptr;
         }
     }
 
@@ -316,14 +331,10 @@ size_t append_frames_from_path(mtmd_context * ctx,
     }
 
     if (is_dir(path)) {
-        return append_frames_from_dir(ctx, path, dst, opts);
-    } else {
-        std::vector<mtmd::bitmap> frames;
-        if (!load_frames_from_file(ctx, path, frames, opts)) return 0;
-        size_t before = dst.entries.size();
-        for (auto & f : frames) dst.entries.push_back(std::move(f));
-        return dst.entries.size() - before;
+        return load_frames_from_dir(ctx, path, opts);
     }
+
+    return load_frames_from_file(ctx, path, opts);
 }
 
 } // namespace mtmd_video
diff --git a/tools/mtmd/mtmd-video.h b/tools/mtmd/mtmd-video.h
index 26055511f27b1..479259147bed1 100644
--- a/tools/mtmd/mtmd-video.h
+++ b/tools/mtmd/mtmd-video.h
@@ -7,14 +7,16 @@
 
 #include "mtmd.h"
 
+#include <cstddef>
+#include <cstdint>
 #include <string>
 #include <vector>
 
 namespace mtmd_video {
 
 struct LoadVideoOptions {
-    int max_frames = 32;   // maximum frames to load (<= 0 means no limit)
-    int stride     = 1;    // take every N-th frame
+    uint32_t max_frames = 32;   // maximum frames to load; must be >= 1 (the loaders reject 0)
+    uint32_t stride     = 1;    // take every N-th frame (0 is treated as 1)
     bool recursive = false;// scan subdirectories
 };
 
@@ -23,36 +25,17 @@ struct VideoInfo {
     int64_t total_frames = 0;   // total number of frames
 };
 
-// Load frames from a directory of images (jpg/png/bmp/webp etc.)
-// Returns true on success (>=1 frame loaded), false otherwise.
-bool load_frames_from_dir(mtmd_context * ctx,
-                          const std::string & dir_path,
-                          std::vector<mtmd::bitmap> & out_frames,
-                          const LoadVideoOptions & opts = {});
-
-// Append frames loaded from a directory into mtmd::bitmaps container.
-// Returns number of frames appended (0 on failure).
-size_t append_frames_from_dir(mtmd_context * ctx,
-                              const std::string & dir_path,
-                              mtmd::bitmaps & dst,
-                              const LoadVideoOptions & opts = {});
+// Check if a path is a video file based on its extension
+bool is_video_file(const std::string & path);
 
 // get video info (fps, total frames) via FFmpeg (mp4/mov/mkv/avi/webm...).
 // Returns true on success, false otherwise.
 bool get_video_info_ffmpeg(const std::string &file, VideoInfo &info);
 
-// Load frames from a video file via FFmpeg (mp4/mov/mkv/avi/webm...).
-// Returns true on success (>=1 frame loaded), false otherwise.
-bool load_frames_from_file(mtmd_context * ctx,
-                           const std::string & file_path,
-                           std::vector<mtmd::bitmap> & out_frames,
-                           const LoadVideoOptions & opts = {});
-
 // Append frames loaded from a file or directory (auto-detect).
-// Returns number of frames appended (0 on failure or unsupported input).
-size_t append_frames_from_path(mtmd_context * ctx,
-                               const std::string & path,
-                               mtmd::bitmaps & dst);
+// Returns a mtmd_bitmap containing all frames in RGB format.
+mtmd_bitmap* init_video_bitmap_from_path(mtmd_context * ctx,
+                               const std::string & path);
 
 } // namespace mtmd_video
 
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index 6a186832742e5..c09e8276c8576 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -7,6 +7,7 @@
 
 #include <algorithm>
 #include <cerrno>
+#include <cstdint>
 #include <cstdio>
 #include <cstdlib>
 #include <cstring>
@@ -21,6 +22,9 @@ struct mtmd_bitmap {
     std::vector<unsigned char> data;
     std::string id; // optional user-defined id, for ex: can be set to image hash, useful for KV cache tracking
     bool is_audio = false; // true if the bitmap is audio
+
+    bool is_video = false; // true if the bitmap is video
+    uint32_t nz; // for video: number of frames
 };
 
 struct mtmd_image_tokens {
@@ -79,6 +83,13 @@ enum mtmd_slice_tmpl {
     // TODO @ngxson : add support for idefics (SmolVLM)
 };
 
+// video schema, used to tokenize video frames; mtmd_context defaults to MTMD_VIDEO_SCHEMA_NONE
+enum mtmd_video_schema {
+    MTMD_VIDEO_SCHEMA_NONE,
+    MTMD_VIDEO_SCHEMA_MINICPMV_2_6, // selected together with MTMD_SLICE_TMPL_MINICPMV_2_6
+    MTMD_VIDEO_SCHEMA_MINICPMV_4_5,
+};
+
 const char * mtmd_default_marker() {
     return "<__media__>";
 }
@@ -114,6 +125,7 @@ struct mtmd_context {
     // for llava-uhd style models, we need special tokens in-between slices
     // minicpmv calls them "slices", llama 4 calls them "tiles"
     mtmd_slice_tmpl slice_tmpl    = MTMD_SLICE_TMPL_NONE;
+    mtmd_video_schema video_sch   = MTMD_VIDEO_SCHEMA_NONE;
     llama_token tok_ov_img_start  = LLAMA_TOKEN_NULL; // overview image
     llama_token tok_ov_img_end    = LLAMA_TOKEN_NULL; // overview image
     llama_token tok_slices_start  = LLAMA_TOKEN_NULL; // start of all slices
@@ -211,6 +223,7 @@ struct mtmd_context {
             // minicpmv 2.6 format:
             // <image> (overview) </image><slice> (slice) </slice><slice> (slice) </slice>\n ...
             slice_tmpl        = MTMD_SLICE_TMPL_MINICPMV_2_6;
+            video_sch         = MTMD_VIDEO_SCHEMA_MINICPMV_2_6;
             tok_ov_img_start  = lookup_token("<image>");
             tok_ov_img_end    = lookup_token("</image>");
             tok_sli_img_start = lookup_token("<slice>");
@@ -471,61 +484,64 @@ struct mtmd_tokenizer {
         }
     }
 
-    int32_t add_media(const mtmd_bitmap * bitmap) {
-        if (!bitmap->is_audio) {
-            // handle image
-
-            if (!ctx->ctx_v) {
-                LOG_ERR("%s: error: model does not support vision input\n", __func__);
-                return 2;
-            }
-
-            if (!ctx->img_beg.empty()) {
-                add_text(ctx->img_beg, true); // add image begin token
-            }
+    int32_t add_image(const mtmd_bitmap * bitmap, bool have_slices=true) {
+        if(mtmd_bitmap_is_audio(bitmap) || mtmd_bitmap_is_video(bitmap)){
+            LOG_ERR("%s: error: bitmap is not an image\n", __func__);
+            return 2;
+        }
+        // handle image
+        if (!ctx->ctx_v) {
+            LOG_ERR("%s: error: model does not support vision input\n", __func__);
+            return 2;
+        }
 
-            // convert mtmd_bitmap to clip_image_u8
-            clip_image_u8_ptr img_u8(clip_image_u8_init());
-            img_u8->nx = bitmap->nx;
-            img_u8->ny = bitmap->ny;
-            img_u8->buf.resize(bitmap->data.size());
-            std::memcpy(img_u8->buf.data(), bitmap->data.data(), img_u8->nx * img_u8->ny * 3);
+        if (!ctx->img_beg.empty()) {
+            add_text(ctx->img_beg, true); // add image begin token
+        }
 
-            // preprocess image
-            clip_image_f32_batch batch_f32;
-            bool ok = clip_image_preprocess(ctx->ctx_v, img_u8.get(), &batch_f32);
-            if (!ok) {
-                LOG_ERR("Unable to preprocess image\n");
-                return 2;
-            }
+        // convert mtmd_bitmap to clip_image_u8
+        clip_image_u8_ptr img_u8(clip_image_u8_init());
+        img_u8->nx = bitmap->nx;
+        img_u8->ny = bitmap->ny;
+        img_u8->buf.resize(bitmap->data.size());
+        std::memcpy(img_u8->buf.data(), bitmap->data.data(), img_u8->nx * img_u8->ny * 3);
+
+        // preprocess image
+        clip_image_f32_batch batch_f32;
+        bool ok = clip_image_preprocess(ctx->ctx_v, img_u8.get(), &batch_f32);
+        if (!ok) {
+            LOG_ERR("Unable to preprocess image\n");
+            return 2;
+        }
 
-            // handle llava-uhd style preprocessing
-            if (
-                ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5
-                || ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6
-                || ctx->slice_tmpl == MTMD_SLICE_TMPL_LLAMA4
-            ) {
-                const int n_col = batch_f32.grid_x;
-                const int n_row = batch_f32.grid_y;
-                // split batch into chunks of single images
-                // NOTE: batch_f32 will be invalidated after this call
-                auto chunks = split_batch_to_chunk(std::move(batch_f32), bitmap->id);
-                GGML_ASSERT(chunks.size() > 0);
-
-                auto ov_chunk = std::move(chunks.front());
-                chunks.erase(chunks.begin());
-
-                // add overview image (first)
-                if (ctx->ov_img_first) {
-                    if (ctx->tok_ov_img_start != LLAMA_TOKEN_NULL) {
-                        add_text({ctx->tok_ov_img_start});
-                    }
-                    cur.entries.emplace_back(std::move(ov_chunk));
-                    if (ctx->tok_ov_img_end != LLAMA_TOKEN_NULL) {
-                        add_text({ctx->tok_ov_img_end});
-                    }
+        // handle llava-uhd style preprocessing
+        if (
+            ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5
+            || ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6
+            || ctx->slice_tmpl == MTMD_SLICE_TMPL_LLAMA4
+        ) {
+            const int n_col = batch_f32.grid_x;
+            const int n_row = batch_f32.grid_y;
+            // split batch into chunks of single images
+            // NOTE: batch_f32 will be invalidated after this call
+            auto chunks = split_batch_to_chunk(std::move(batch_f32), bitmap->id);
+            GGML_ASSERT(chunks.size() > 0);
+
+            auto ov_chunk = std::move(chunks.front());
+            chunks.erase(chunks.begin());
+
+            // add overview image (first)
+            if (ctx->ov_img_first) {
+                if (ctx->tok_ov_img_start != LLAMA_TOKEN_NULL) {
+                    add_text({ctx->tok_ov_img_start});
                 }
-
+                cur.entries.emplace_back(std::move(ov_chunk));
+                if (ctx->tok_ov_img_end != LLAMA_TOKEN_NULL) {
+                    add_text({ctx->tok_ov_img_end});
+                }
+            }
+            
+            if(have_slices){
                 // add slices (or tiles)
                 if (!chunks.empty()) {
                     GGML_ASSERT((int)chunks.size() == n_row * n_col);
@@ -554,56 +570,60 @@ struct mtmd_tokenizer {
                         add_text({ctx->tok_slices_end});
                     }
                 }
+            }
 
-                // add overview image (last)
-                if (!ctx->ov_img_first) {
-                    if (ctx->tok_ov_img_start != LLAMA_TOKEN_NULL) {
-                        add_text({ctx->tok_ov_img_start});
-                    }
-                    cur.entries.emplace_back(std::move(ov_chunk));
-                    if (ctx->tok_ov_img_end != LLAMA_TOKEN_NULL) {
-                        add_text({ctx->tok_ov_img_end});
-                    }
-                }
-
-            } else {
-                size_t n_tokens = 0;
-                for (const auto & entry : batch_f32.entries) {
-                    n_tokens += clip_n_output_tokens(ctx->ctx_v, entry.get());
+            // add overview image (last)
+            if (!ctx->ov_img_first) {
+                if (ctx->tok_ov_img_start != LLAMA_TOKEN_NULL) {
+                    add_text({ctx->tok_ov_img_start});
                 }
-
-                mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
-                if (ctx->use_mrope) {
-                    // for Qwen2VL, we need this information for M-RoPE decoding positions
-                    image_tokens->nx = clip_n_output_tokens_x(ctx->ctx_v, batch_f32.entries[0].get());
-                    image_tokens->ny = clip_n_output_tokens_y(ctx->ctx_v, batch_f32.entries[0].get());
-                    image_tokens->use_mrope_pos = true;
-                } else {
-                    // other models, we only need the total number of tokens
-                    image_tokens->nx = n_tokens;
-                    image_tokens->ny = 1;
+                cur.entries.emplace_back(std::move(ov_chunk));
+                if (ctx->tok_ov_img_end != LLAMA_TOKEN_NULL) {
+                    add_text({ctx->tok_ov_img_end});
                 }
-                image_tokens->batch_f32 = std::move(batch_f32);
-                image_tokens->id = bitmap->id; // optional
-
-                LOG_DBG("image_tokens->nx = %d\n", image_tokens->nx);
-                LOG_DBG("image_tokens->ny = %d\n", image_tokens->ny);
-                LOG_DBG("batch_f32 size = %d\n", (int)image_tokens->batch_f32.entries.size());
+            }
 
-                mtmd_input_chunk chunk{
-                    MTMD_INPUT_CHUNK_TYPE_IMAGE,
-                    {}, // text tokens
-                    std::move(image_tokens),
-                    nullptr, // audio tokens
-                };
-                cur.entries.emplace_back(std::move(chunk));
+        } else {
+            size_t n_tokens = 0;
+            for (const auto & entry : batch_f32.entries) {
+                n_tokens += clip_n_output_tokens(ctx->ctx_v, entry.get());
             }
 
-            if (!ctx->img_end.empty()) {
-                add_text(ctx->img_end, true); // add image end token
+            mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
+            if (ctx->use_mrope) {
+                // for Qwen2VL, we need this information for M-RoPE decoding positions
+                image_tokens->nx = clip_n_output_tokens_x(ctx->ctx_v, batch_f32.entries[0].get());
+                image_tokens->ny = clip_n_output_tokens_y(ctx->ctx_v, batch_f32.entries[0].get());
+                image_tokens->use_mrope_pos = true;
+            } else {
+                // other models, we only need the total number of tokens
+                image_tokens->nx = n_tokens;
+                image_tokens->ny = 1;
             }
+            image_tokens->batch_f32 = std::move(batch_f32);
+            image_tokens->id = bitmap->id; // optional
 
-        } else {
+            LOG_DBG("image_tokens->nx = %d\n", image_tokens->nx);
+            LOG_DBG("image_tokens->ny = %d\n", image_tokens->ny);
+            LOG_DBG("batch_f32 size = %d\n", (int)image_tokens->batch_f32.entries.size());
+
+            mtmd_input_chunk chunk{
+                MTMD_INPUT_CHUNK_TYPE_IMAGE,
+                {}, // text tokens
+                std::move(image_tokens),
+                nullptr, // audio tokens
+            };
+            cur.entries.emplace_back(std::move(chunk));
+        }
+
+        if (!ctx->img_end.empty()) {
+            add_text(ctx->img_end, true); // add image end token
+        }
+        return 0;
+    }
+
+    int32_t add_media(const mtmd_bitmap * bitmap) {
+        if(mtmd_bitmap_is_audio(bitmap)){
             // handle audio
 
             if (!ctx->ctx_a) {
@@ -663,9 +683,39 @@ struct mtmd_tokenizer {
             if (!ctx->aud_end.empty()) {
                 add_text(ctx->aud_end, true); // add audio end token
             }
+
+            return 0;
         }
 
-        return 0;
+        if(mtmd_bitmap_is_video(bitmap)){
+            // handle video
+            if (!ctx->ctx_v) {
+                LOG_ERR("%s: error: model does not support vision input\n", __func__);
+                return 2;
+            }
+
+            if (
+                ctx->video_sch == MTMD_VIDEO_SCHEMA_MINICPMV_4_5
+            ) {
+                // TODO
+                LOG_ERR("%s: error: video schema MINICPMV_4_5 is not implemented yet\n", __func__);
+                return 2;
+            }else{
+                const auto frame_size = bitmap->nx * bitmap->ny * 3;
+                for(uint32_t f = 0; f < bitmap->nz; f++){
+                    // extract frame f
+                    mtmd_bitmap* frame_bitmap = mtmd_bitmap_init(bitmap->nx, bitmap->ny, bitmap->data.data() + f * frame_size);
+                    // minicpmv 2.6 video understanding: disable UHD slicing (overview only)
+                    int32_t res = add_image(frame_bitmap, false /* have_slices */);
+                    mtmd_bitmap_free(frame_bitmap);
+                    if (res != 0) return res;
+                }
+            }
+            return 0;
+        }
+
+        // handle image otherwise
+        return add_image(bitmap);
     }
 
     std::vector<mtmd_input_chunk> split_batch_to_chunk(clip_image_f32_batch && batch_f32, const std::string & id) {
@@ -850,6 +900,9 @@ mtmd_bitmap * mtmd_bitmap_init(uint32_t nx,
     mtmd_bitmap * bitmap = new mtmd_bitmap;
     bitmap->nx = nx;
     bitmap->ny = ny;
+    bitmap->nz = 1;
+    bitmap->is_audio = false;
+    bitmap->is_video = false;
     size_t data_size = (size_t)nx * ny * 3;
     bitmap->data.resize(data_size);
     std::memcpy(bitmap->data.data(), data, data_size);
@@ -861,13 +914,31 @@ mtmd_bitmap * mtmd_bitmap_init_from_audio(size_t n_samples,
     mtmd_bitmap * bitmap = new mtmd_bitmap;
     bitmap->nx = n_samples;
     bitmap->ny = 1;
+    bitmap->nz = 1;
     bitmap->is_audio = true;
+    bitmap->is_video = false;
     size_t data_size = n_samples * sizeof(float);
     bitmap->data.resize(data_size);
     std::memcpy(bitmap->data.data(), data, data_size);
     return bitmap;
 }
 
+mtmd_bitmap * mtmd_bitmap_init_from_video(uint32_t nx, uint32_t ny, uint32_t nframes, const unsigned char * data){
+    mtmd_bitmap * bitmap = new mtmd_bitmap;
+    bitmap->nx = nx;
+    bitmap->ny = ny;
+    bitmap->nz = nframes;
+    bitmap->is_audio = false;
+    bitmap->is_video = true;
+    size_t data_size = (size_t)nx * ny * nframes * 3;
+    bitmap->data.resize(data_size);
+    if(data != nullptr){
+        std::memcpy(bitmap->data.data(), data, data_size);
+    }
+    return bitmap;
+}
+
+
 uint32_t mtmd_bitmap_get_nx(const mtmd_bitmap * bitmap) {
     return bitmap->nx;
 }
@@ -880,6 +951,10 @@ const unsigned char * mtmd_bitmap_get_data(const mtmd_bitmap * bitmap) {
     return bitmap->data.data();
 }
 
+unsigned char * mtmd_bitmap_get_data_mutable(mtmd_bitmap * bitmap) {
+    return bitmap->data.data();
+}
+
 size_t mtmd_bitmap_get_n_bytes(const mtmd_bitmap * bitmap) {
     return bitmap->data.size();
 }
@@ -888,6 +963,10 @@ bool mtmd_bitmap_is_audio(const mtmd_bitmap * bitmap) {
     return bitmap->is_audio;
 }
 
+bool mtmd_bitmap_is_video(const mtmd_bitmap * bitmap) {
+    return bitmap->is_video;
+}
+
 const char * mtmd_bitmap_get_id(const mtmd_bitmap * bitmap) {
     return bitmap->id.c_str();
 }
diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h
index daf675e50f0b6..a67d3502a597d 100644
--- a/tools/mtmd/mtmd.h
+++ b/tools/mtmd/mtmd.h
@@ -124,13 +124,19 @@ MTMD_API void mtmd_set_minicpmv_max_slice_nums(mtmd_context * ctx, int n);
 // if bitmap is audio:
 //     length of data must be n_samples * sizeof(float)
 //     the data is in float format (PCM F32)
+// if bitmap is video:
+//     length of data must be nx * ny * nframes * 3
+//     the data is in RGBRGBRGB... format for each frame, frames are stored sequentially
 MTMD_API mtmd_bitmap *         mtmd_bitmap_init           (uint32_t nx, uint32_t ny, const unsigned char * data);
 MTMD_API mtmd_bitmap *         mtmd_bitmap_init_from_audio(size_t n_samples,         const float         * data);
+MTMD_API mtmd_bitmap *         mtmd_bitmap_init_from_video(uint32_t nx, uint32_t ny, uint32_t nframes, const unsigned char * data);
 MTMD_API uint32_t              mtmd_bitmap_get_nx     (const mtmd_bitmap * bitmap);
 MTMD_API uint32_t              mtmd_bitmap_get_ny     (const mtmd_bitmap * bitmap);
 MTMD_API const unsigned char * mtmd_bitmap_get_data   (const mtmd_bitmap * bitmap);
+MTMD_API unsigned char * mtmd_bitmap_get_data_mutable (mtmd_bitmap * bitmap);
 MTMD_API size_t                mtmd_bitmap_get_n_bytes(const mtmd_bitmap * bitmap);
 MTMD_API bool                  mtmd_bitmap_is_audio   (const mtmd_bitmap * bitmap);
+MTMD_API bool                  mtmd_bitmap_is_video   (const mtmd_bitmap * bitmap);
 MTMD_API void                  mtmd_bitmap_free       (mtmd_bitmap * bitmap);
 // bitmap ID is optional, but useful for KV cache tracking
 // these getters/setters are dedicated functions, so you can for example calculate the hash of the image based on mtmd_bitmap_get_data()

From 2c1d02aa595d95e873b7621f2dfac4377d11f907 Mon Sep 17 00:00:00 2001
From: qijia <2421653893@qq.com>
Date: Fri, 31 Oct 2025 10:25:09 +0800
Subject: [PATCH 05/13] refactor: add bitmap_type, instead of is_audio,
 is_video...

---
 tools/mtmd/mtmd-video.cpp |  2 +-
 tools/mtmd/mtmd.cpp       | 38 +++++++++++++++++++++++---------------
 2 files changed, 24 insertions(+), 16 deletions(-)

diff --git a/tools/mtmd/mtmd-video.cpp b/tools/mtmd/mtmd-video.cpp
index e9d6aea366653..d0e547fed880a 100644
--- a/tools/mtmd/mtmd-video.cpp
+++ b/tools/mtmd/mtmd-video.cpp
@@ -319,7 +319,7 @@ mtmd_bitmap* init_video_bitmap_from_path(mtmd_context * ctx,
         }
     }
 
-    // minicpm normal speed
+    // minicpm frames sample method
     const int32_t minicpmv_max_video_frames = 64;
     opts.max_frames = minicpmv_max_video_frames;
     if(info.total_frames > minicpmv_max_video_frames) {
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index c09e8276c8576..615b3faf165eb 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -14,17 +14,28 @@
 #include <limits>
 #include <vector>
 
-// represents raw image data, layout is RGBRGBRGB...
-// length of data must be nx * ny * 3
+enum class mtmd_bitmap_type {
+    IMAGE,
+    AUDIO,
+    VIDEO,
+}; 
+
+// if bitmap is image:
+//     length of data must be nx * ny * 3
+//     the data is in RGBRGBRGB... format
+// if bitmap is audio:
+//     length of data must be n_samples * sizeof(float)
+//     the data is in float format (PCM F32)
+// if bitmap is video:
+//     length of data must be nx * ny * nframes * 3
+//     the data is in RGBRGBRGB... format for each frame, frames are stored sequentially
 struct mtmd_bitmap {
+    mtmd_bitmap_type type;
     uint32_t nx;
-    uint32_t ny;
+    uint32_t ny; // for audio, ny=1
+    uint32_t nz; // for video: number of frames
     std::vector<unsigned char> data;
     std::string id; // optional user-defined id, for ex: can be set to image hash, useful for KV cache tracking
-    bool is_audio = false; // true if the bitmap is audio
-
-    bool is_video = false; // true if the bitmap is video
-    uint32_t nz; // for video: number of frames
 };
 
 struct mtmd_image_tokens {
@@ -901,8 +912,7 @@ mtmd_bitmap * mtmd_bitmap_init(uint32_t nx,
     bitmap->nx = nx;
     bitmap->ny = ny;
     bitmap->nz = 1;
-    bitmap->is_audio = false;
-    bitmap->is_video = false;
+    bitmap->type = mtmd_bitmap_type::IMAGE;
     size_t data_size = (size_t)nx * ny * 3;
     bitmap->data.resize(data_size);
     std::memcpy(bitmap->data.data(), data, data_size);
@@ -915,8 +925,7 @@ mtmd_bitmap * mtmd_bitmap_init_from_audio(size_t n_samples,
     bitmap->nx = n_samples;
     bitmap->ny = 1;
     bitmap->nz = 1;
-    bitmap->is_audio = true;
-    bitmap->is_video = false;
+    bitmap->type = mtmd_bitmap_type::AUDIO;
     size_t data_size = n_samples * sizeof(float);
     bitmap->data.resize(data_size);
     std::memcpy(bitmap->data.data(), data, data_size);
@@ -928,8 +937,7 @@ mtmd_bitmap * mtmd_bitmap_init_from_video(uint32_t nx, uint32_t ny, uint32_t nfr
     bitmap->nx = nx;
     bitmap->ny = ny;
     bitmap->nz = nframes;
-    bitmap->is_audio = false;
-    bitmap->is_video = true;
+    bitmap->type = mtmd_bitmap_type::VIDEO;
     size_t data_size = (size_t)nx * ny * nframes * 3;
     bitmap->data.resize(data_size);
     if(data != nullptr){
@@ -960,11 +968,11 @@ size_t mtmd_bitmap_get_n_bytes(const mtmd_bitmap * bitmap) {
 }
 
 bool mtmd_bitmap_is_audio(const mtmd_bitmap * bitmap) {
-    return bitmap->is_audio;
+    return bitmap->type == mtmd_bitmap_type::AUDIO;
 }
 
 bool mtmd_bitmap_is_video(const mtmd_bitmap * bitmap) {
-    return bitmap->is_video;
+    return bitmap->type == mtmd_bitmap_type::VIDEO;
 }
 
 const char * mtmd_bitmap_get_id(const mtmd_bitmap * bitmap) {

From 816b92b37a2342fcd33abd015d32187f166c285a Mon Sep 17 00:00:00 2001
From: qijia <2421653893@qq.com>
Date: Fri, 31 Oct 2025 15:51:54 +0800
Subject: [PATCH 06/13] feat: support loading video from buffer

---
 tools/mtmd/mtmd-helper.cpp |  17 +-
 tools/mtmd/mtmd-video.cpp  | 345 +++++++++++++++++++++++++++++++++----
 tools/mtmd/mtmd-video.h    |  12 +-
 3 files changed, 329 insertions(+), 45 deletions(-)

diff --git a/tools/mtmd/mtmd-helper.cpp b/tools/mtmd/mtmd-helper.cpp
index 46c85ecd3614d..6cbaa7ac2d1cc 100644
--- a/tools/mtmd/mtmd-helper.cpp
+++ b/tools/mtmd/mtmd-helper.cpp
@@ -423,6 +423,10 @@ mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigne
         return mtmd_bitmap_init_from_audio(pcmf32.size(), pcmf32.data());
     }
 
+    if(mtmd_video::is_video_buffer(buf, len)) {
+        return mtmd_video::init_video_bitmap(ctx, buf, len);
+    }
+
     // otherwise, we assume it's an image
     mtmd_bitmap * result = nullptr;
     {
@@ -439,11 +443,12 @@ mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigne
 }
 
 mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname) {
+    // although we could read the file into memory and call mtmd_helper_bitmap_init_from_buf,
+    // but for video files, it's better to let ffmpeg read from file
     if(mtmd_video::is_video_file(fname)){
-        return mtmd_video::init_video_bitmap_from_path(ctx, fname);
+        return mtmd_video::init_video_bitmap(ctx, fname);
     }
 
-    std::vector<unsigned char> buf;
     FILE * f = fopen(fname, "rb");
     if (!f) {
         LOG_ERR("Unable to open file %s: %s\n", fname, strerror(errno));
@@ -453,14 +458,15 @@ mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char *
     fseek(f, 0, SEEK_END);
     long file_size = ftell(f);
     fseek(f, 0, SEEK_SET);
-    buf.resize(file_size);
+    // the vector owns the file contents, so they are released on every return path
+    std::vector<unsigned char> buf(file_size);
 
     size_t n_read = fread(buf.data(), 1, file_size, f);
     fclose(f);
     if (n_read != (size_t)file_size) {
         LOG_ERR("Failed to read entire file %s", fname);
         return nullptr;
     }
 
     return mtmd_helper_bitmap_init_from_buf(ctx, buf.data(), buf.size());
 }
diff --git a/tools/mtmd/mtmd-video.cpp b/tools/mtmd/mtmd-video.cpp
index d0e547fed880a..60acee4a6eaeb 100644
--- a/tools/mtmd/mtmd-video.cpp
+++ b/tools/mtmd/mtmd-video.cpp
@@ -1,6 +1,6 @@
 #include "mtmd-video.h"
+#include "clip-impl.h"
 #include "mtmd-helper.h"
-#include "clip.h"
 
 #include <algorithm>
 #include <string>
@@ -156,21 +156,193 @@ extern "C" {
 }
 #pragma GCC diagnostic pop
 
+bool is_video_buffer(const uint8_t *data, size_t size){
+    if (!data || size < 16) return false; // too short
+
+    AVProbeData probe;
+    probe.buf = const_cast<uint8_t*>(data);
+    probe.buf_size = (int)size;
+    probe.filename = "";
+
+    // ffmpeg requires that the last AVPROBE_PADDING_SIZE bytes of the buffer must be 0
+    std::vector<uint8_t> padded(size + AVPROBE_PADDING_SIZE);
+    memcpy(padded.data(), data, size);
+    memset(padded.data() + size, 0, AVPROBE_PADDING_SIZE);
+    probe.buf = padded.data();
+    probe.buf_size = (int)size;
+
+    const AVInputFormat *fmt = av_probe_input_format(&probe, 1);
+    if (!fmt) return false;
+    if (fmt->flags & AVFMT_NOFILE) return false;
+
+    return true;
+}
+
 struct DecodedFrameRGBA {
     int width;
     int height;
     std::vector<unsigned char> rgba; // size = width * height * 4
 };
 
-bool get_video_info_ffmpeg(const std::string &file, VideoInfo &info) {
-    AVFormatContext *fmt = nullptr;
-    if (avformat_open_input(&fmt, file.c_str(), nullptr, nullptr) < 0) {
+struct BufferData {
+    const uint8_t* base;
+    size_t size;
+    size_t pos;
+    BufferData(const uint8_t* b, size_t s) : base(b), size(s), pos(0) {}
+};
+
+static int read_packet(void* opaque, uint8_t* buf, int buf_size) {
+    BufferData* bd = static_cast<BufferData*>(opaque);
+    if (!bd || !bd->base) return AVERROR(EIO);
+    if (bd->pos >= bd->size) return AVERROR_EOF;
+    size_t rem = bd->size - bd->pos;
+    int to_read = (int)(rem < (size_t)buf_size ? rem : (size_t)buf_size);
+    if (to_read == 0) return AVERROR_EOF;
+    memcpy(buf, bd->base + bd->pos, to_read);
+    bd->pos += to_read;
+    return to_read;
+}
+
+static int64_t seek_packet(void* opaque, int64_t offset, int whence) {
+    BufferData* bd = static_cast<BufferData*>(opaque);
+    if (!bd) return -1;
+    if (whence == AVSEEK_SIZE) return (int64_t)bd->size;
+    size_t newpos = bd->pos;
+    if (whence == SEEK_SET) {
+        if (offset < 0 || (size_t)offset > bd->size) return -1;
+        newpos = (size_t)offset;
+    } else if (whence == SEEK_CUR) {
+        if (offset < 0 && (size_t)(-offset) > bd->pos) return -1;
+        newpos = bd->pos + (size_t)offset;
+        if (newpos > bd->size) return -1;
+    } else if (whence == SEEK_END) {
+        if (offset > 0 || (size_t)(-offset) > bd->size) return -1;
+        newpos = bd->size + (size_t)offset;
+    } else return -1;
+    bd->pos = newpos;
+    return (int64_t)bd->pos;
+}
+
+// Open an in-memory media file with FFmpeg through a custom, read-only AVIOContext.
+//   buffer, size    - encoded media; must stay valid until the context is released
+//   fmt             - [out] opened format context, null on failure
+//   avio_ctx        - [out] custom I/O context attached to fmt->pb, null on failure
+//   avio_ctx_buffer - [out] initial I/O buffer; FFmpeg may replace it later, so it is
+//                     informational only and must not be freed by the caller
+// Returns true on success; the caller then releases everything with
+// free_format_context_from_buffer(). On failure nothing is left allocated.
+static bool create_format_context_from_buffer(const uint8_t* buffer, size_t size,
+                                       AVFormatContext*& fmt,
+                                       AVIOContext*& avio_ctx,
+                                       uint8_t*& avio_ctx_buffer) {
+    fmt = nullptr;
+    avio_ctx = nullptr;
+    avio_ctx_buffer = nullptr;
+
+    if (!buffer || size == 0) return false;
+
+    // read cursor handed to read_packet()/seek_packet() through AVIOContext::opaque
+    BufferData* bd = new (std::nothrow) BufferData(buffer, size);
+    if (!bd) return false;
+
+    const int AVIO_BUF_SIZE = 4096;
+    avio_ctx_buffer = static_cast<uint8_t*>(av_malloc(AVIO_BUF_SIZE));
+    if (!avio_ctx_buffer) {
+        delete bd;
+        return false;
+    }
+
+    avio_ctx = avio_alloc_context(
+        avio_ctx_buffer, AVIO_BUF_SIZE,
+        0, // read only
+        bd,
+        &read_packet,
+        nullptr,
+        &seek_packet
+    );
+
+    if (!avio_ctx) {
+        av_free(avio_ctx_buffer);
+        delete bd;
+        avio_ctx_buffer = nullptr;
+        return false;
+    }
+
+    // Tear down the custom I/O layer on any later failure.
+    // avio_context_free() releases only the AVIOContext struct, so the opaque
+    // BufferData and the current I/O buffer have to be freed explicitly.
+    auto release_io = [&]() {
+        delete bd;
+        av_freep(&avio_ctx->buffer);
+        avio_context_free(&avio_ctx); // also resets avio_ctx to nullptr
+        avio_ctx_buffer = nullptr;
+    };
+
+    fmt = avformat_alloc_context();
+    if (!fmt) {
+        release_io();
+        return false;
+    }
+
+    fmt->pb = avio_ctx;
+    fmt->flags |= AVFMT_FLAG_CUSTOM_IO;
+
+    // increase probing - optional but helpful for truncated/streamed files
+    AVDictionary* opts = nullptr;
+    av_dict_set(&opts, "probesize", "5000000", 0);
+    av_dict_set(&opts, "analyzeduration", "5000000", 0);
+
+    int ret = avformat_open_input(&fmt, "stream", nullptr, &opts);
+    av_dict_free(&opts);
+
+    if (ret < 0) {
+        // avformat_open_input() frees a user-supplied context on failure and sets
+        // fmt to nullptr, but it never touches a custom pb - release it ourselves
+        if (fmt) {
+            avformat_free_context(fmt);
+            fmt = nullptr;
+        }
+        release_io();
+        return false;
+    }
+
+    // success: the caller owns fmt and avio_ctx and must release them (together with
+    // the BufferData stored in avio_ctx->opaque) via free_format_context_from_buffer()
+    return true;
+}
+
+// Release everything create_format_context_from_buffer() set up.
+// With AVFMT_FLAG_CUSTOM_IO, avformat_close_input() leaves fmt->pb untouched, and
+// avio_context_free() releases only the AVIOContext struct - not its I/O buffer and
+// not the opaque pointer - so both must be freed explicitly here.
+//   fmt      - opened format context, may be null
+//   avio_ctx - custom I/O context, only consulted when fmt is null
+static void free_format_context_from_buffer(AVFormatContext* fmt,
+                                     AVIOContext* avio_ctx) {
+    // prefer the context attached to fmt; capture it BEFORE fmt is destroyed
+    AVIOContext* pb = fmt ? fmt->pb : avio_ctx;
+
+    if (fmt) {
+        // closes the demuxer and frees fmt, but not the custom pb
+        avformat_close_input(&fmt);
+    }
+
+    if (!pb) {
+        return;
+    }
+
+    // opaque is the BufferData allocated with new in create_format_context_from_buffer()
+    delete static_cast<BufferData*>(pb->opaque);
+    pb->opaque = nullptr;
+
+    // FFmpeg may have replaced the buffer we handed to avio_alloc_context(),
+    // so free whatever pb->buffer points to now rather than the original pointer
+    av_freep(&pb->buffer);
+    avio_context_free(&pb);
+}
+
 
+static bool get_video_info_from_format_ctx(AVFormatContext *fmt, VideoInfo &info) {
+    if (!fmt) return false;
+    
     if (avformat_find_stream_info(fmt, nullptr) < 0) {
         return false;
     }
@@ -200,14 +372,45 @@ bool get_video_info_ffmpeg(const std::string &file, VideoInfo &info) {
     return true;
 }
 
-static bool decode_video_ffmpeg_to_rgba(const std::string & file,
-                                        std::vector<DecodedFrameRGBA> & frames,
-                                        int max_frames,
-                                        int stride) {
-    if(stride <= 0 || max_frames <= 0) return false;
-    AVFormatContext * fmt = nullptr;
-    if (avformat_open_input(&fmt, file.c_str(), nullptr, nullptr) < 0) return false;
-    std::unique_ptr<AVFormatContext, void(*)(AVFormatContext*)> fmt_guard(fmt, [](AVFormatContext *f){ if (f) avformat_close_input(&f); });
+// from buffer; returns false (rather than aborting) if FFmpeg cannot open the data
+bool get_video_info(const uint8_t* buffer, size_t size, VideoInfo &info) {
+    AVFormatContext* fmt = nullptr;
+    AVIOContext* avio_ctx = nullptr;
+    uint8_t* avio_ctx_buffer = nullptr;
+    // caller-supplied media may be corrupt or unsupported: report it, do not abort
+    if (!create_format_context_from_buffer(buffer, size, fmt, avio_ctx, avio_ctx_buffer)) return false;
+    bool ok = get_video_info_from_format_ctx(fmt, info);
+    free_format_context_from_buffer(fmt, avio_ctx);
+    return ok;
+}
+
+// from file
+bool get_video_info(const std::string &path, VideoInfo &info) {
+    if(is_dir(path)) {
+        info.fps = 1; // do not care
+        std::vector<std::string> files;
+        list_files(path, files, true); // recursive
+        info.total_frames = files.size();
+        return true;
+    }
+    AVFormatContext* fmt = nullptr;
+    if (avformat_open_input(&fmt, path.c_str(), nullptr, nullptr) < 0)
+        return false;
+
+    std::unique_ptr<AVFormatContext, void(*)(AVFormatContext*)> fmt_guard(fmt, [](AVFormatContext* f){
+        if (f) avformat_close_input(&f);
+    });
+
+    return get_video_info_from_format_ctx(fmt, info);
+}
+
+static bool decode_video_ffmpeg_to_rgba_from_format_ctx(
+    AVFormatContext* fmt,
+    std::vector<DecodedFrameRGBA>& frames,
+    int max_frames,
+    int stride) 
+{
+    if(!fmt || stride <= 0 || max_frames <= 0) return false;
     if (avformat_find_stream_info(fmt, nullptr) < 0) return false;
     int vstream = av_find_best_stream(fmt, AVMEDIA_TYPE_VIDEO, -1, -1, nullptr, 0);
     if (vstream < 0) return false;
@@ -257,14 +460,48 @@ static bool decode_video_ffmpeg_to_rgba(const std::string & file,
     return taken > 0;
 }
 
-static mtmd_bitmap* load_frames_from_file(mtmd_context * ctx,
-                           const std::string & file_path,
-                           const LoadVideoOptions & opts) {
+// from file
+static bool decode_video_ffmpeg_to_rgba(
+    const std::string& file,
+    std::vector<DecodedFrameRGBA>& frames,
+    int max_frames,
+    int stride)
+{
+    AVFormatContext* fmt = nullptr;
+    if (avformat_open_input(&fmt, file.c_str(), nullptr, nullptr) < 0)
+        return false;
+
+    std::unique_ptr<AVFormatContext, void(*)(AVFormatContext*)> fmt_guard(fmt, [](AVFormatContext* f){
+        if (f) avformat_close_input(&f);
+    });
+
+    return decode_video_ffmpeg_to_rgba_from_format_ctx(fmt, frames, max_frames, stride);
+}
+
+// from buffer; returns false (rather than aborting) if FFmpeg cannot open the data
+static bool decode_video_ffmpeg_to_rgba(
+    const uint8_t* buffer,
+    size_t size,
+    std::vector<DecodedFrameRGBA>& frames,
+    int max_frames,
+    int stride)
+{
+    if (!buffer || size == 0) return false;
+    AVFormatContext* fmt = nullptr;
+    AVIOContext* avio_ctx = nullptr;
+    uint8_t* avio_ctx_buffer = nullptr;
+    // caller-supplied media may be corrupt or unsupported: report it, do not abort
+    if (!create_format_context_from_buffer(buffer, size, fmt, avio_ctx, avio_ctx_buffer)) return false;
+
+    bool ok = decode_video_ffmpeg_to_rgba_from_format_ctx(fmt, frames, max_frames, stride);
+    // release the format context and custom I/O layer whether or not decoding worked
+    free_format_context_from_buffer(fmt, avio_ctx);
+    return ok;
+}
+
+static mtmd_bitmap* convert_frames_to_bitmap(mtmd_context * ctx, const std::vector<DecodedFrameRGBA>& decoded) {
     if (!ctx) return nullptr;
-    std::vector<DecodedFrameRGBA> decoded;
-    if (!decode_video_ffmpeg_to_rgba(file_path, decoded, opts.max_frames, std::max(1u, opts.stride))) {
-        return nullptr;
-    }
+    if(decoded.empty()) return nullptr;
     const size_t nframes = decoded.size();
     if(nframes < 1){
         return nullptr;
@@ -294,33 +531,28 @@ static mtmd_bitmap* load_frames_from_file(mtmd_context * /*ctx*/,
                            const LoadVideoOptions & /*opts*/) {
     return nullptr;
 }
-bool get_video_info_ffmpeg(const std::string &file, VideoInfo &info) {
+bool get_video_info(const std::string &path, VideoInfo &info){
+    LOG_ERR("FFmpeg support is not enabled in this build\n");
+    return false;
+}
+bool get_video_info(const uint8_t* buffer, size_t size, VideoInfo &info){
+    LOG_ERR("FFmpeg support is not enabled in this build\n");
+    return false;
+}
+bool is_video_buffer(const uint8_t *data, size_t size){
     LOG_ERR("FFmpeg support is not enabled in this build\n");
     return false;
 }
 #endif
 
-mtmd_bitmap* init_video_bitmap_from_path(mtmd_context * ctx,
-                               const std::string & path) {
+static mtmd_video::LoadVideoOptions get_video_sample_options(mtmd_video::VideoInfo info){
     mtmd_video::LoadVideoOptions opts;
     opts.max_frames = 32;
     opts.stride     = 1;
     opts.recursive  = false;
 
-    auto info = mtmd_video::VideoInfo{};
-    if(is_dir(path)) {
-        info.fps = 1;
-        std::vector<std::string> files;
-        list_files(path, files, opts.recursive);
-        info.total_frames = files.size();
-    } else {
-        if(!mtmd_video::get_video_info_ffmpeg(path, info)) {
-            return nullptr;
-        }
-    }
-
     // minicpm frames sample method
-    const int32_t minicpmv_max_video_frames = 64;
+    const int32_t minicpmv_max_video_frames = 4;
     opts.max_frames = minicpmv_max_video_frames;
     if(info.total_frames > minicpmv_max_video_frames) {
         // uniform sample
@@ -329,12 +561,49 @@ mtmd_bitmap* init_video_bitmap_from_path(mtmd_context * ctx,
         // 1 frame per second
         opts.stride = (info.fps > 1.0) ? (int)std::ceil(info.fps) : 1;
     }
+    return opts;
+}
+
+mtmd_bitmap* init_video_bitmap(mtmd_context * ctx, const std::string & path) {
+    auto info = mtmd_video::VideoInfo{};
+    if(!mtmd_video::get_video_info(path, info)) {
+        LOG_ERR("Unable to get video info from path: %s\n", path.c_str());
+        return nullptr;
+    }
+
+    const auto opts = get_video_sample_options(info);
 
     if (is_dir(path)) {
         return load_frames_from_dir(ctx, path, opts);
     }
 
-    return load_frames_from_file(ctx, path, opts);
+    std::vector<DecodedFrameRGBA> frames;
+    if(!decode_video_ffmpeg_to_rgba(path, frames, opts.max_frames, std::max(1u, opts.stride))){
+        LOG_ERR("Unable to decode video from path: %s\n", path.c_str());
+        return nullptr;
+    }
+
+    return convert_frames_to_bitmap(ctx, frames);
+}
+
+mtmd_bitmap* init_video_bitmap(mtmd_context * ctx, const uint8_t* buffer, size_t size){
+    auto info = mtmd_video::VideoInfo{};
+    if(!mtmd_video::get_video_info(buffer, size, info)) {
+        LOG_ERR("Unable to get video info from buffer\n");
+        return nullptr;
+    }
+    printf("get info\n");
+
+    const auto opts = get_video_sample_options(info);
+
+    std::vector<DecodedFrameRGBA> frames;
+    if(!decode_video_ffmpeg_to_rgba(buffer, size, frames, opts.max_frames, std::max(1u, opts.stride))){
+        LOG_ERR("Unable to decode video from buffer\n");
+        return nullptr;
+    }
+    printf("decoded\n");
+
+    return convert_frames_to_bitmap(ctx, frames);
 }
 
 } // namespace mtmd_video
diff --git a/tools/mtmd/mtmd-video.h b/tools/mtmd/mtmd-video.h
index 479259147bed1..19e8a67799853 100644
--- a/tools/mtmd/mtmd-video.h
+++ b/tools/mtmd/mtmd-video.h
@@ -28,14 +28,22 @@ struct VideoInfo {
 // Check if a path is a video file based on its extension
 bool is_video_file(const std::string & path);
 
+// Check if a buffer contains video file data via FFmpeg
+// Notice: audio containers may also be recognized as valid media
+bool is_video_buffer(const uint8_t *data, size_t size);
+
 // get video info (fps, total frames) via FFmpeg (mp4/mov/mkv/avi/webm...).
 // Returns true on success, false otherwise.
-bool get_video_info_ffmpeg(const std::string &file, VideoInfo &info);
+bool get_video_info(const std::string &path, VideoInfo &info);
+bool get_video_info(const uint8_t* buffer, size_t size, VideoInfo &info);
 
 // Append frames loaded from a file or directory (auto-detect).
 // Returns a mtmd_bitmap containing all frames in RGB format.
-mtmd_bitmap* init_video_bitmap_from_path(mtmd_context * ctx,
+mtmd_bitmap* init_video_bitmap(mtmd_context * ctx,
                                const std::string & path);
+mtmd_bitmap* init_video_bitmap(mtmd_context * ctx,
+                                 const uint8_t* buffer,
+                                 size_t size);
 
 } // namespace mtmd_video
 

From efefc2a2756d51cee1fc3c4076ba69df72af41bf Mon Sep 17 00:00:00 2001
From: qijia <2421653893@qq.com>
Date: Fri, 31 Oct 2025 19:36:37 +0800
Subject: [PATCH 07/13] feat&fix: fix segmentation fault bug, add llama-server
 video_url api

---
 tools/mtmd/mtmd-video.cpp | 27 ++++++++++++++-------
 tools/server/README.md    |  4 ++--
 tools/server/server.cpp   | 10 ++++----
 tools/server/utils.hpp    | 50 +++++++++++++++++++++++++++++++++++++--
 4 files changed, 73 insertions(+), 18 deletions(-)

diff --git a/tools/mtmd/mtmd-video.cpp b/tools/mtmd/mtmd-video.cpp
index 60acee4a6eaeb..72d048c13d62a 100644
--- a/tools/mtmd/mtmd-video.cpp
+++ b/tools/mtmd/mtmd-video.cpp
@@ -160,21 +160,31 @@ bool is_video_buffer(const uint8_t *data, size_t size){
     if (!data || size < 16) return false; // too short
 
     AVProbeData probe;
-    probe.buf = const_cast<uint8_t*>(data);
-    probe.buf_size = (int)size;
     probe.filename = "";
 
     // ffmpeg requires that the last AVPROBE_PADDING_SIZE bytes of the buffer must be 0
-    std::vector<uint8_t> padded(size + AVPROBE_PADDING_SIZE);
-    memcpy(padded.data(), data, size);
-    memset(padded.data() + size, 0, AVPROBE_PADDING_SIZE);
-    probe.buf = padded.data();
+    std::vector<uint8_t> padded(size + AVPROBE_PADDING_SIZE, 0); // zero-filled, so the padding is already 0
+    memcpy(padded.data(), data, size);
+    probe.mime_type = nullptr; // `probe` is declared without an initializer, so every field must be set by hand
+    probe.buf = padded.data();
     probe.buf_size = (int)size;
 
-    const AVInputFormat *fmt = av_probe_input_format(&probe, 1);
-    if (!fmt) return false;
+    int score = 0;
+    const AVInputFormat *fmt = av_probe_input_format2(&probe, 1, &score);
+
+    // `padded` releases its storage on every return path below; no manual delete is needed.
+    // Reject low-confidence matches (25 presumably mirrors AVPROBE_SCORE_RETRY - TODO confirm).
+    if (!fmt || score < 25) return false;
     if (fmt->flags & AVFMT_NOFILE) return false;
 
+    static const char *image_formats[] = {
+        "jpeg_pipe", "png_pipe", "bmp_pipe", "gif_pipe", "webp_pipe",
+        "tiff_pipe", "image2", "image2pipe", "mjpeg"
+    };
+    for (auto name : image_formats)
+        if (fmt->name && strstr(fmt->name, name))
+            return false;
+
     return true;
 }
 
@@ -592,7 +602,6 @@ mtmd_bitmap* init_video_bitmap(mtmd_context * ctx, const uint8_t* buffer, size_t
         LOG_ERR("Unable to get video info from buffer\n");
         return nullptr;
     }
-    printf("get info\n");
 
     const auto opts = get_video_sample_options(info);
 
diff --git a/tools/server/README.md b/tools/server/README.md
index 73b4cc6f03a28..b1893688afc89 100644
--- a/tools/server/README.md
+++ b/tools/server/README.md
@@ -430,7 +430,7 @@ Multiple prompts are also supported. In this case, the completion result will be
   - Strings, JSON objects, and sequences of tokens: `["string1", [12, 34, 56], { "prompt_string": "string", "multimodal_data": ["base64"]}]`
   - Mixed types: `[[12, 34, "string", 56, 78], [12, 34, 56], "string", { "prompt_string": "string" }]`
 
-Note for `multimodal_data` in JSON object prompts. This should be an array of strings, containing base64 encoded multimodal data such as images and audio. There must be an identical number of MTMD media markers in the string prompt element which act as placeholders for the data provided to this parameter. The multimodal data files will be substituted in order. The marker string (e.g. `<__media__>`) can be found by calling `mtmd_default_marker()` defined in [the MTMD C API](https://github.com/ggml-org/llama.cpp/blob/5fd160bbd9d70b94b5b11b0001fd7f477005e4a0/tools/mtmd/mtmd.h#L87). A client *must not* specify this field unless the server has the multimodal capability. Clients should check `/models` or `/v1/models` for the `multimodal` capability before a multimodal request.
+Note for `multimodal_data` in JSON object prompts. This should be an array of strings, containing base64 encoded multimodal data such as images, audio and video. There must be an identical number of MTMD media markers in the string prompt element which act as placeholders for the data provided to this parameter. The multimodal data files will be substituted in order. The marker string (e.g. `<__media__>`) can be found by calling `mtmd_default_marker()` defined in [the MTMD C API](https://github.com/ggml-org/llama.cpp/blob/5fd160bbd9d70b94b5b11b0001fd7f477005e4a0/tools/mtmd/mtmd.h#L87). A client *must not* specify this field unless the server has the multimodal capability. Clients should check `/models` or `/v1/models` for the `multimodal` capability before a multimodal request.
 
 `temperature`: Adjust the randomness of the generated text. Default: `0.8`
 
@@ -1210,7 +1210,7 @@ print(completion.choices[0].text)
 
 Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only models with a [supported chat template](https://github.com/ggml-org/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, the ChatML template will be used.
 
-If model supports multimodal, you can input the media file via `image_url` content part. We support both base64 and remote URL as input. See OAI documentation for more.
+If the model supports multimodal input, you can pass media files via the `image_url` or `video_url` content part. Both base64 data and remote URLs are supported as input. See OAI documentation for more.
 
 *Options:*
 
diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index d6072e5ece266..4197ffdcba08f 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -3638,21 +3638,21 @@ struct server_context {
                     // remove the non-common part from the cache
                     slot.cache_tokens.keep_first(slot.n_past);
 
-                    // check if we should process the image
+                    // check if we should process the media chunk (image, audio, video, ...)
                     if (slot.n_past < slot.n_prompt_tokens && slot.prompt_tokens[slot.n_past] == LLAMA_TOKEN_NULL) {
-                        // process the image
+                        // process the media chunk
                         int32_t new_n_past;
                         int32_t res = slot.prompt_tokens.process_chunk(ctx, mctx, slot.n_past, slot.id, new_n_past);
                         int32_t n_pos = new_n_past - slot.n_past;
 
                         if (res != 0) {
-                            SLT_ERR(slot, "failed to process image, res = %d\n", res);
+                            SLT_ERR(slot, "failed to process media, res = %d\n", res);
                             slot.release();
-                            send_error(slot, "failed to process image", ERROR_TYPE_SERVER);
+                            send_error(slot, "failed to process media", ERROR_TYPE_SERVER);
                             continue;
                         }
 
-                        // add the image chunk to cache
+                        // add the media chunk to cache
                         {
                             const auto & chunk = slot.prompt_tokens.find_chunk(slot.n_past);
                             slot.cache_tokens.push_back(chunk.get()); // copy
diff --git a/tools/server/utils.hpp b/tools/server/utils.hpp
index 64d702930ce96..0314afc290d25 100644
--- a/tools/server/utils.hpp
+++ b/tools/server/utils.hpp
@@ -677,7 +677,53 @@ static json oaicompat_chat_params_parse(
                 p["text"] = mtmd_default_marker();
                 p.erase("input_audio");
 
-            } else if (type != "text") {
+            } else if (type == "video_url") {
+                if (!opt.allow_image) { // TODO: separate video flag?
+                    throw std::runtime_error("video input is not supported - hint: if this is unexpected, you may need to provide the mmproj");
+                }
+
+                json video_url  = json_value(p, "video_url", json::object());
+                std::string url = json_value(video_url, "url", std::string());
+                if (string_starts_with(url, "http")) {
+                    // download remote video
+                    // TODO @ngxson : maybe make these params configurable
+                    common_remote_params params;
+                    params.headers.push_back("User-Agent: llama.cpp/" + build_info);
+                    params.max_size = 1024 * 1024 * 100; // 100MB
+                    params.timeout  = 100; // seconds
+                    SRV_INF("downloading video from '%s'\n", url.c_str());
+                    auto res = common_remote_get_content(url, params);
+                    if (200 <= res.first && res.first < 300) {
+                        SRV_INF("downloaded %zu bytes\n", res.second.size()); // size() is a size_t, so %zu (not %ld)
+                        raw_buffer data;
+                        data.insert(data.end(), res.second.begin(), res.second.end());
+                        out_files.push_back(data);
+                    } else {
+                        throw std::runtime_error("Failed to download video");
+                    }
+
+                } else {
+                    // try to decode base64 video
+                    std::vector<std::string> parts = string_split<std::string>(url, /*separator*/ ',');
+                    if (parts.size() != 2) {
+                        throw std::runtime_error("Invalid video_url.url value");
+                    } else if (!string_starts_with(parts[0], "data:video/")) {
+                        throw std::runtime_error("Invalid video_url.url format: " + parts[0]);
+                    } else if (!string_ends_with(parts[0], "base64")) {
+                        throw std::runtime_error("video_url.url must be base64 encoded");
+                    } else {
+                        auto base64_data = parts[1];
+                        auto decoded_data = base64_decode(base64_data);
+                        out_files.push_back(decoded_data);
+                    }
+                }
+
+                // replace this chunk with a marker
+                p["type"] = "text";
+                p["text"] = mtmd_default_marker();
+                p.erase("video_url");
+
+            }else if (type != "text") {
                 throw std::runtime_error("unsupported content[].type");
             }
         }
@@ -1401,7 +1447,7 @@ static server_tokens process_mtmd_prompt(mtmd_context * mctx, std::string prompt
     for (auto & file : files) {
         mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(mctx, file.data(), file.size()));
         if (!bmp.ptr) {
-            throw std::runtime_error("Failed to load image or audio file");
+            throw std::runtime_error("Failed to load media file");
         }
         // calculate bitmap hash (for KV caching)
         std::string hash = fnv_hash(bmp.data(), bmp.n_bytes());

From 32a45c482586380b8d47bc85ab8595f45a7e088d Mon Sep 17 00:00:00 2001
From: qijia <2421653893@qq.com>
Date: Fri, 31 Oct 2025 20:53:07 +0800
Subject: [PATCH 08/13] fix: max video frames

---
 tools/mtmd/mtmd-video.cpp | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/tools/mtmd/mtmd-video.cpp b/tools/mtmd/mtmd-video.cpp
index 72d048c13d62a..c739a200967c0 100644
--- a/tools/mtmd/mtmd-video.cpp
+++ b/tools/mtmd/mtmd-video.cpp
@@ -561,8 +561,13 @@ static mtmd_video::LoadVideoOptions get_video_sample_options(mtmd_video::VideoIn
     opts.stride     = 1;
     opts.recursive  = false;
 
-    // minicpm frames sample method
+#ifdef MTMD_MAX_VIDEO_FRAMES_SMALL
+    // set a small number of frames for fast test locally
     const int32_t minicpmv_max_video_frames = 4;
+#else
+    // minicpm frames sample method
+    const int32_t minicpmv_max_video_frames = 64;
+#endif
     opts.max_frames = minicpmv_max_video_frames;
     if(info.total_frames > minicpmv_max_video_frames) {
         // uniform sample

From 5ee744fa732f3d319ded38d98f65d5d48cf1249b Mon Sep 17 00:00:00 2001
From: qijia <2421653893@qq.com>
Date: Fri, 31 Oct 2025 21:05:42 +0800
Subject: [PATCH 09/13] clean up

---
 .clang-tidy                |   2 +
 tools/mtmd/CMakeLists.txt  |   5 +
 tools/mtmd/mtmd-cli.cpp    |   4 +-
 tools/mtmd/mtmd-helper.cpp |  81 ++++++++++++++--
 tools/mtmd/mtmd-helper.h   |   8 +-
 tools/mtmd/mtmd-video.cpp  | 188 +++++++++++++------------------------
 tools/mtmd/mtmd.cpp        |   6 +-
 7 files changed, 156 insertions(+), 138 deletions(-)

diff --git a/.clang-tidy b/.clang-tidy
index 803b8b46a32f3..41f5f5ce626a5 100644
--- a/.clang-tidy
+++ b/.clang-tidy
@@ -14,6 +14,8 @@ Checks: >
     -readability-uppercase-literal-suffix,
     -readability-simplify-boolean-expr,
     -readability-math-missing-parentheses,
+    -readability-braces-around-statements,
+    -readability-isolate-declaration,
     clang-analyzer-*,
     -clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,
     performance-*,
diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt
index e135d4727f0a4..b4f10b882f96e 100644
--- a/tools/mtmd/CMakeLists.txt
+++ b/tools/mtmd/CMakeLists.txt
@@ -39,6 +39,11 @@ if (MTMD_WITH_FFMPEG)
     endif()
 endif()
 
+option(MTMD_MAX_VIDEO_FRAMES_SMALL "Set a small number of frames for fast test locally" OFF)
+if(MTMD_MAX_VIDEO_FRAMES_SMALL)
+    target_compile_definitions(mtmd PRIVATE MTMD_MAX_VIDEO_FRAMES_SMALL)
+endif()
+
 if (BUILD_SHARED_LIBS)
     set_target_properties     (mtmd PROPERTIES POSITION_INDEPENDENT_CODE ON)
     target_compile_definitions(mtmd PRIVATE LLAMA_BUILD)
diff --git a/tools/mtmd/mtmd-cli.cpp b/tools/mtmd/mtmd-cli.cpp
index fb5b666da07b1..ce08ff256a58d 100644
--- a/tools/mtmd/mtmd-cli.cpp
+++ b/tools/mtmd/mtmd-cli.cpp
@@ -153,8 +153,8 @@ struct mtmd_cli_context {
         );
     }
 
-    bool load_media(const std::string & fname) {
-        mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(ctx_vision.get(), fname.c_str()));
+    bool load_media(const std::string & path) {
+        mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(ctx_vision.get(), path.c_str()));
         if (!bmp.ptr) {
             return false;
         }
diff --git a/tools/mtmd/mtmd-helper.cpp b/tools/mtmd/mtmd-helper.cpp
index 6cbaa7ac2d1cc..8f000be64e9d6 100644
--- a/tools/mtmd/mtmd-helper.cpp
+++ b/tools/mtmd/mtmd-helper.cpp
@@ -5,6 +5,10 @@
 #   define NOMINMAX
 #endif
 #include <windows.h>
+#else
+#include <dirent.h>
+#include <sys/stat.h>
+#include <sys/types.h>
 #endif
 
 #include "mtmd.h"
@@ -442,16 +446,16 @@ mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigne
     return result;
 }
 
-mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname) {
+mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * path) {
     // although we could read the file into memory and call mtmd_helper_bitmap_init_from_buf,
     // but for video files, it's better to let ffmpeg read from file
-    if(mtmd_video::is_video_file(fname)){
-        return mtmd_video::init_video_bitmap(ctx, fname);
+    if(mtmd_video::is_video_file(path) || mtmd_helper::is_dir(path)){
+        return mtmd_video::init_video_bitmap(ctx, path);
     }
 
-    FILE * f = fopen(fname, "rb");
+    FILE * f = fopen(path, "rb");
     if (!f) {
-        LOG_ERR("Unable to open file %s: %s\n", fname, strerror(errno));
+        LOG_ERR("Unable to open path %s: %s\n", path, strerror(errno));
         return nullptr;
     }
 
@@ -463,7 +467,7 @@ mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char *
     size_t n_read = fread(buf, 1, file_size, f);
     fclose(f);
     if (n_read != (size_t)file_size) {
-        LOG_ERR("Failed to read entire file %s", fname);
+        LOG_ERR("Failed to read entire path %s", path);
         return nullptr;
     }
 
@@ -471,3 +475,68 @@ mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char *
     delete [] buf;
     return res;
 }
+
+namespace mtmd_helper{
+
+bool has_image_ext(const std::string & name) { // true iff `name` ends with a known image extension (case-insensitive)
+    auto lower = name;
+    std::transform(lower.begin(), lower.end(), lower.begin(), [](unsigned char c){ return (char)std::tolower(c); });
+    // compare the suffix only: a bare rfind() would also accept "shots.png/readme.txt" or "a.png.bak"
+    const auto ends_with = [&lower](const std::string & ext) {
+        return lower.size() >= ext.size() && lower.compare(lower.size() - ext.size(), ext.size(), ext) == 0;
+    };
+    return ends_with(".jpg") || ends_with(".jpeg") || ends_with(".png") ||
+           ends_with(".bmp") || ends_with(".gif")  || ends_with(".webp");
+}
+
+bool is_dir(const std::string & path) {
+#if defined(_WIN32)
+    DWORD attrs = GetFileAttributesA(path.c_str());
+    return (attrs != INVALID_FILE_ATTRIBUTES) && (attrs & FILE_ATTRIBUTE_DIRECTORY);
+#else
+    struct stat st;
+    if (stat(path.c_str(), &st) != 0) return false;
+    return S_ISDIR(st.st_mode);
+#endif
+}
+
+void list_files(const std::string & dir, std::vector<std::string> & out, bool recursive) {
+#if defined(_WIN32)
+    std::string pattern = dir;
+    if (!pattern.empty() && pattern.back() != '/' && pattern.back() != '\\') pattern += "\\";
+    pattern += "*";
+    WIN32_FIND_DATAA ffd;
+    HANDLE hFind = FindFirstFileA(pattern.c_str(), &ffd);
+    if (hFind == INVALID_HANDLE_VALUE) return;
+    do {
+        std::string name = ffd.cFileName;
+        if (name == "." || name == "..") continue;
+        std::string path = dir;
+        if (!path.empty() && path.back() != '/' && path.back() != '\\') path += "\\";
+        path += name;
+        if (ffd.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) {
+            if (recursive) list_files(path, out, recursive);
+        } else {
+            out.push_back(path);
+        }
+    } while (FindNextFileA(hFind, &ffd) != 0);
+    FindClose(hFind);
+#else
+    DIR * dp = opendir(dir.c_str());
+    if (!dp) return;
+    struct dirent * de;
+    while ((de = readdir(dp)) != nullptr) {
+        std::string name = de->d_name;
+        if (name == "." || name == "..") continue;
+        std::string path = dir + "/" + name;
+        if (is_dir(path)) {
+            if (recursive) list_files(path, out, recursive);
+        } else {
+            out.push_back(path);
+        }
+    }
+    closedir(dp);
+#endif
+}
+
+}
\ No newline at end of file
diff --git a/tools/mtmd/mtmd-helper.h b/tools/mtmd/mtmd-helper.h
index 5c0edc6937eee..73ad8c89b45c9 100644
--- a/tools/mtmd/mtmd-helper.h
+++ b/tools/mtmd/mtmd-helper.h
@@ -24,7 +24,7 @@ extern "C" {
 // it calls mtmd_helper_bitmap_init_from_buf() internally
 // returns nullptr on failure
 // this function is thread-safe
-MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname);
+MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * path);
 
 // helper function to construct a mtmd_bitmap from a buffer containing a file
 // supported formats:
@@ -88,4 +88,10 @@ MTMD_API int32_t mtmd_helper_decode_image_chunk(mtmd_context * ctx,
 // C++ wrappers
 //
 
+namespace mtmd_helper{
+    bool has_image_ext(const std::string & name);
+    bool is_dir(const std::string & path);
+    void list_files(const std::string & dir, std::vector<std::string> & out, bool recursive);
+}
+
 #endif
diff --git a/tools/mtmd/mtmd-video.cpp b/tools/mtmd/mtmd-video.cpp
index c739a200967c0..86865b943be2b 100644
--- a/tools/mtmd/mtmd-video.cpp
+++ b/tools/mtmd/mtmd-video.cpp
@@ -1,5 +1,6 @@
 #include "mtmd-video.h"
 #include "clip-impl.h"
+#include "ggml.h"
 #include "mtmd-helper.h"
 
 #include <algorithm>
@@ -11,82 +12,6 @@
 #include <memory>
 #include <cmath>
 
-#if defined(_WIN32)
-#define WIN32_LEAN_AND_MEAN
-#ifndef NOMINMAX
-#   define NOMINMAX
-#endif
-#include <windows.h>
-#else
-#include <dirent.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-#endif
-
-namespace {
-
-static bool has_image_ext(const std::string & name) {
-    auto lower = name;
-    std::transform(lower.begin(), lower.end(), lower.begin(), [](unsigned char c){ return (char)std::tolower(c); });
-    return lower.rfind(".jpg")  != std::string::npos ||
-           lower.rfind(".jpeg") != std::string::npos ||
-           lower.rfind(".png")  != std::string::npos ||
-           lower.rfind(".bmp")  != std::string::npos ||
-           lower.rfind(".gif")  != std::string::npos ||
-           lower.rfind(".webp") != std::string::npos;
-}
-
-static bool is_dir(const std::string & path) {
-#if defined(_WIN32)
-    DWORD attrs = GetFileAttributesA(path.c_str());
-    return (attrs != INVALID_FILE_ATTRIBUTES) && (attrs & FILE_ATTRIBUTE_DIRECTORY);
-#else
-    struct stat st;
-    if (stat(path.c_str(), &st) != 0) return false;
-    return S_ISDIR(st.st_mode);
-#endif
-}
-
-static void list_files(const std::string & dir, std::vector<std::string> & out, bool recursive) {
-#if defined(_WIN32)
-    std::string pattern = dir;
-    if (!pattern.empty() && pattern.back() != '/' && pattern.back() != '\\') pattern += "\\";
-    pattern += "*";
-    WIN32_FIND_DATAA ffd;
-    HANDLE hFind = FindFirstFileA(pattern.c_str(), &ffd);
-    if (hFind == INVALID_HANDLE_VALUE) return;
-    do {
-        std::string name = ffd.cFileName;
-        if (name == "." || name == "..") continue;
-        std::string path = dir;
-        if (!path.empty() && path.back() != '/' && path.back() != '\\') path += "\\";
-        path += name;
-        if (ffd.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) {
-            if (recursive) list_files(path, out, recursive);
-        } else {
-            out.push_back(path);
-        }
-    } while (FindNextFileA(hFind, &ffd) != 0);
-    FindClose(hFind);
-#else
-    DIR * dp = opendir(dir.c_str());
-    if (!dp) return;
-    struct dirent * de;
-    while ((de = readdir(dp)) != nullptr) {
-        std::string name = de->d_name;
-        if (name == "." || name == "..") continue;
-        std::string path = dir + "/" + name;
-        if (is_dir(path)) {
-            if (recursive) list_files(path, out, recursive);
-        } else {
-            out.push_back(path);
-        }
-    }
-    closedir(dp);
-#endif
-}
-
-} // namespace
 
 namespace mtmd_video {
 
@@ -100,50 +25,61 @@ bool is_video_file(const std::string & path){
            lower.rfind(".webm") != std::string::npos;
 }
 
+static bool get_video_info_from_dir(const std::string &path, VideoInfo &info){
+    info.fps = 1; // do not care
+    std::vector<std::string> files;
+    mtmd_helper::list_files(path, files, true); // recursive
+    info.total_frames = files.size();
+    return true;
+}
 // untested
 static mtmd_bitmap* load_frames_from_dir(mtmd_context * ctx,
                           const std::string & dir_path,
                           const LoadVideoOptions & opts) {
-    if (!ctx || dir_path.empty() || !is_dir(dir_path) || opts.max_frames < 1) {
+    if (!ctx || dir_path.empty() || !mtmd_helper::is_dir(dir_path) || opts.max_frames < 1) {
         return nullptr;
     }
-    // note: hparam-based control is applied inside clip.cpp; nothing to set globally here
-
     std::vector<std::string> files;
-    list_files(dir_path, files, opts.recursive);
+    mtmd_helper::list_files(dir_path, files, opts.recursive);
     std::sort(files.begin(), files.end());
 
     auto stride = std::max(1u, opts.stride);
     size_t loaded = 0;
     unsigned char* dest = nullptr;
     mtmd_bitmap* out_frames = nullptr;
+    const auto nframes = std::min((files.size() + stride - 1) / stride, (size_t)opts.max_frames); // ceil: the loop visits 0, stride, 2*stride, ...; floor gave 0 frames (and an overflowing memcpy) when files.size() < stride
 
     uint32_t w=0, h=0;
     for (size_t i = 0; i < files.size(); i++) {
         if (i % stride != 0) continue;
         const std::string & f = files[i];
-        if (!has_image_ext(f)) continue;
+        if (!mtmd_helper::has_image_ext(f)) continue;
         mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(ctx, f.c_str()));
         if (!bmp.ptr) continue;
         if(loaded==0){
             w = bmp.nx();
             h = bmp.ny();
-            out_frames = mtmd_bitmap_init_from_video(w, h, loaded, nullptr);
+            out_frames = mtmd_bitmap_init_from_video(w, h, nframes, nullptr);
             dest = mtmd_bitmap_get_data_mutable(out_frames);
-        }else if(bmp.nx() != w || bmp.ny() != h){
-            return nullptr; // all frames must have the same size
         }
+        GGML_ASSERT(bmp.nx() == w && bmp.ny() == h); // all frames must have the same size
         std::memcpy(dest,
                     bmp.data(),
                     bmp.n_bytes());
         dest += bmp.n_bytes();
         loaded++;
-        if (loaded >= opts.max_frames) break;
+        if (loaded >= nframes) break;
     }
     
     return out_frames;
 }
 
+struct DecodedFrameRGBA {
+    int width;
+    int height;
+    std::vector<unsigned char> rgba; // size = width * height * 4
+};
+
 // --- FFmpeg-based file decoding (optional) ---
 
 #ifdef MTMD_WITH_FFMPEG
@@ -181,19 +117,13 @@ bool is_video_buffer(const uint8_t *data, size_t size){
         "jpeg_pipe", "png_pipe", "bmp_pipe", "gif_pipe", "webp_pipe",
         "tiff_pipe", "image2", "image2pipe", "mjpeg"
     };
-    for (auto name : image_formats)
+    for (const auto* name : image_formats)
         if (fmt->name && strstr(fmt->name, name))
             return false;
 
     return true;
 }
 
-struct DecodedFrameRGBA {
-    int width;
-    int height;
-    std::vector<unsigned char> rgba; // size = width * height * 4
-};
-
 struct BufferData {
     const uint8_t* base;
     size_t size;
@@ -298,7 +228,7 @@ static bool create_format_context_from_buffer(const uint8_t* buffer, size_t size
             BufferData* bd_from_fmt = pb ? static_cast<BufferData*>(pb->opaque) : nullptr;
             avformat_free_context(fmt);
             if (pb) {
-                if (bd_from_fmt) delete bd_from_fmt;
+                delete bd_from_fmt;
                 avio_context_free(&pb); // frees pb->buffer
             }
             fmt = nullptr;
@@ -342,7 +272,7 @@ static void free_format_context_from_buffer(AVFormatContext* fmt,
     // partial failure case: fmt is null but avio_ctx may still be valid
     if (avio_ctx) {
         BufferData* bd = static_cast<BufferData*>(avio_ctx->opaque);
-        if (bd) delete bd;
+        delete bd;
         avio_context_free(&avio_ctx); // frees avio_ctx->buffer
         // avio_ctx_buffer already freed by avio_context_free
         return;
@@ -396,13 +326,8 @@ bool get_video_info(const uint8_t* buffer, size_t size, VideoInfo &info) {
 
 // from file
 bool get_video_info(const std::string &path, VideoInfo &info) {
-    if(is_dir(path)) {
-        info.fps = 1; // do not care
-        std::vector<std::string> files;
-        list_files(path, files, true); // recursive
-        info.total_frames = files.size();
-        return true;
-    }
+    if(mtmd_helper::is_dir(path)) return get_video_info_from_dir(path, info);
+
     AVFormatContext* fmt = nullptr;
     if (avformat_open_input(&fmt, path.c_str(), nullptr, nullptr) < 0)
         return false;
@@ -464,7 +389,6 @@ static bool decode_video_ffmpeg_to_rgba_from_format_ctx(
             taken++;
             if (taken >= max_frames) break;
         }
-        if (taken >= max_frames) break;
     }
     if (sws) sws_freeContext(sws);
     return taken > 0;
@@ -508,6 +432,38 @@ static bool decode_video_ffmpeg_to_rgba(
     free_format_context_from_buffer(fmt, avio_ctx);
     return ok;
 }
+#else
+bool get_video_info(const std::string &path, VideoInfo &info){
+    if(mtmd_helper::is_dir(path)) return get_video_info_from_dir(path, info);
+    LOG_ERR("FFmpeg support is not enabled in this build\n");
+    return false;
+}
+bool get_video_info(const uint8_t* /*buffer*/, size_t /*size*/, VideoInfo &/*info*/){
+    LOG_ERR("FFmpeg support is not enabled in this build\n");
+    return false;
+}
+bool is_video_buffer(const uint8_t */*data*/, size_t /*size*/){
+    LOG_ERR("FFmpeg support is not enabled in this build\n");
+    return false;
+}
+static bool decode_video_ffmpeg_to_rgba(
+    const std::string& /*file*/,
+    std::vector<DecodedFrameRGBA>& /*frames*/,
+    int /*max_frames*/,
+    int /*stride*/)
+{
+    return false;   
+}
+static bool decode_video_ffmpeg_to_rgba(
+    const uint8_t* /*buffer*/,
+    size_t /*size*/,
+    std::vector<DecodedFrameRGBA>& /*frames*/,
+    int /*max_frames*/,
+    int /*stride*/)
+{
+    return false;   
+}
+#endif
 
 static mtmd_bitmap* convert_frames_to_bitmap(mtmd_context * ctx, const std::vector<DecodedFrameRGBA>& decoded) {
     if (!ctx) return nullptr;
@@ -521,7 +477,7 @@ static mtmd_bitmap* convert_frames_to_bitmap(mtmd_context * ctx, const std::vect
     mtmd_bitmap* out_frames = mtmd_bitmap_init_from_video(uint32_t(w), uint32_t(h), uint32_t(nframes), nullptr);
     unsigned char * dst = mtmd_bitmap_get_data_mutable(out_frames);
 
-    for (auto & fr : decoded) {
+    for (const auto & fr : decoded) {
         GGML_ASSERT(w == fr.width && h == fr.height);
         const unsigned char * src = fr.rgba.data();
         for (int i = 0; i < w * h; ++i) {
@@ -535,25 +491,6 @@ static mtmd_bitmap* convert_frames_to_bitmap(mtmd_context * ctx, const std::vect
 
     return out_frames;
 }
-#else
-static mtmd_bitmap* load_frames_from_file(mtmd_context * /*ctx*/,
-                           const std::string & /*file_path*/,
-                           const LoadVideoOptions & /*opts*/) {
-    return nullptr;
-}
-bool get_video_info(const std::string &path, VideoInfo &info){
-    LOG_ERR("FFmpeg support is not enabled in this build\n");
-    return false;
-}
-bool get_video_info(const uint8_t* buffer, size_t size, VideoInfo &info){
-    LOG_ERR("FFmpeg support is not enabled in this build\n");
-    return false;
-}
-bool is_video_buffer(const uint8_t *data, size_t size){
-    LOG_ERR("FFmpeg support is not enabled in this build\n");
-    return false;
-}
-#endif
 
 static mtmd_video::LoadVideoOptions get_video_sample_options(mtmd_video::VideoInfo info){
     mtmd_video::LoadVideoOptions opts;
@@ -561,11 +498,12 @@ static mtmd_video::LoadVideoOptions get_video_sample_options(mtmd_video::VideoIn
     opts.stride     = 1;
     opts.recursive  = false;
 
+    /* MiniCPM-V normal-speed video frames sample method */
+
 #ifdef MTMD_MAX_VIDEO_FRAMES_SMALL
     // set a small number of frames for fast test locally
     const int32_t minicpmv_max_video_frames = 4;
 #else
-    // minicpm frames sample method
     const int32_t minicpmv_max_video_frames = 64;
 #endif
     opts.max_frames = minicpmv_max_video_frames;
@@ -576,6 +514,7 @@ static mtmd_video::LoadVideoOptions get_video_sample_options(mtmd_video::VideoIn
         // 1 frame per second
         opts.stride = (info.fps > 1.0) ? (int)std::ceil(info.fps) : 1;
     }
+    
     return opts;
 }
 
@@ -588,7 +527,7 @@ mtmd_bitmap* init_video_bitmap(mtmd_context * ctx, const std::string & path) {
 
     const auto opts = get_video_sample_options(info);
 
-    if (is_dir(path)) {
+    if (mtmd_helper::is_dir(path)) {
         return load_frames_from_dir(ctx, path, opts);
     }
 
@@ -615,7 +554,6 @@ mtmd_bitmap* init_video_bitmap(mtmd_context * ctx, const uint8_t* buffer, size_t
         LOG_ERR("Unable to decode video from buffer\n");
         return nullptr;
     }
-    printf("decoded\n");
 
     return convert_frames_to_bitmap(ctx, frames);
 }
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index 615b3faf165eb..bfae8f3239524 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -96,8 +96,7 @@ enum mtmd_slice_tmpl {
 
 // video schema, used to tokenize video frames 
 enum mtmd_video_schema {
-    MTMD_VIDEO_SCHEMA_NONE,
-    MTMD_VIDEO_SCHEMA_MINICPMV_2_6,
+    MTMD_VIDEO_SCHEMA_MINICPMV_2_6, // add overview image only (no slicing)
     MTMD_VIDEO_SCHEMA_MINICPMV_4_5,
 };
 
@@ -136,7 +135,7 @@ struct mtmd_context {
     // for llava-uhd style models, we need special tokens in-between slices
     // minicpmv calls them "slices", llama 4 calls them "tiles"
     mtmd_slice_tmpl slice_tmpl    = MTMD_SLICE_TMPL_NONE;
-    mtmd_video_schema video_sch   = MTMD_VIDEO_SCHEMA_NONE;
+    mtmd_video_schema video_sch   = MTMD_VIDEO_SCHEMA_MINICPMV_2_6;
     llama_token tok_ov_img_start  = LLAMA_TOKEN_NULL; // overview image
     llama_token tok_ov_img_end    = LLAMA_TOKEN_NULL; // overview image
     llama_token tok_slices_start  = LLAMA_TOKEN_NULL; // start of all slices
@@ -234,7 +233,6 @@ struct mtmd_context {
             // minicpmv 2.6 format:
             // <image> (overview) </image><slice> (slice) </slice><slice> (slice) </slice>\n ...
             slice_tmpl        = MTMD_SLICE_TMPL_MINICPMV_2_6;
-            video_sch         = MTMD_VIDEO_SCHEMA_MINICPMV_2_6;
             tok_ov_img_start  = lookup_token("<image>");
             tok_ov_img_end    = lookup_token("</image>");
             tok_sli_img_start = lookup_token("<slice>");

From 6e8c9f653ee3d916fb9ebdab7b93f9507ca5f697 Mon Sep 17 00:00:00 2001
From: qijia <2421653893@qq.com>
Date: Sat, 1 Nov 2025 01:27:56 +0800
Subject: [PATCH 10/13] refactor: get_video_info and decode_video use the same
 AVFormatContext

---
 tools/mtmd/mtmd-video.cpp | 238 ++++++++++++++------------------------
 tools/mtmd/mtmd-video.h   |   5 -
 2 files changed, 89 insertions(+), 154 deletions(-)

diff --git a/tools/mtmd/mtmd-video.cpp b/tools/mtmd/mtmd-video.cpp
index 86865b943be2b..88892d389535f 100644
--- a/tools/mtmd/mtmd-video.cpp
+++ b/tools/mtmd/mtmd-video.cpp
@@ -25,12 +25,11 @@ bool is_video_file(const std::string & path){
            lower.rfind(".webm") != std::string::npos;
 }
 
-static bool get_video_info_from_dir(const std::string &path, VideoInfo &info){
+static void get_video_info_from_dir(const std::string &path, VideoInfo &info){
     info.fps = 1; // do not care
     std::vector<std::string> files;
     mtmd_helper::list_files(path, files, true); // recursive
     info.total_frames = files.size();
-    return true;
 }
 // untested
 static mtmd_bitmap* load_frames_from_dir(mtmd_context * ctx,
@@ -80,6 +79,32 @@ struct DecodedFrameRGBA {
     std::vector<unsigned char> rgba; // size = width * height * 4
 };
 
+static mtmd_video::LoadVideoOptions get_video_sample_options(mtmd_video::VideoInfo info){
+    mtmd_video::LoadVideoOptions opts;
+    opts.max_frames = 32;
+    opts.stride     = 1;
+    opts.recursive  = false;
+
+    /* MiniCPM-V normal-speed video frames sample method */
+
+#ifdef MTMD_MAX_VIDEO_FRAMES_SMALL
+    // set a small number of frames for fast test locally
+    const int32_t minicpmv_max_video_frames = 4;
+#else
+    const int32_t minicpmv_max_video_frames = 64;
+#endif
+    opts.max_frames = minicpmv_max_video_frames;
+    if(info.total_frames > minicpmv_max_video_frames) {
+        // uniform sample
+        opts.stride = (int)std::ceil((double)info.total_frames / minicpmv_max_video_frames);
+    } else {
+        // 1 frame per second
+        opts.stride = (info.fps > 1.0) ? (int)std::ceil(info.fps) : 1;
+    }
+    
+    return opts;
+}
+
 // --- FFmpeg-based file decoding (optional) ---
 
 #ifdef MTMD_WITH_FFMPEG
@@ -165,11 +190,10 @@ static int64_t seek_packet(void* opaque, int64_t offset, int whence) {
 
 static bool create_format_context_from_buffer(const uint8_t* buffer, size_t size,
                                        AVFormatContext*& fmt,
-                                       AVIOContext*& avio_ctx,
-                                       uint8_t*& avio_ctx_buffer) {
+                                       AVIOContext*& avio_ctx) {
     fmt = nullptr;
     avio_ctx = nullptr;
-    avio_ctx_buffer = nullptr;
+    uint8_t* avio_ctx_buffer = nullptr;
 
     if (!buffer || size == 0) return false;
 
@@ -312,39 +336,13 @@ static bool get_video_info_from_format_ctx(AVFormatContext *fmt, VideoInfo &info
     return true;
 }
 
-// from buffer
-bool get_video_info(const uint8_t* buffer, size_t size, VideoInfo &info) {
-    AVFormatContext* fmt = nullptr;
-    AVIOContext* avio_ctx = nullptr;
-    uint8_t* avio_ctx_buffer = nullptr;
-
-    GGML_ASSERT(create_format_context_from_buffer(buffer, size, fmt, avio_ctx, avio_ctx_buffer));
-    bool ok = get_video_info_from_format_ctx(fmt, info);
-    free_format_context_from_buffer(fmt, avio_ctx);
-    return ok;
-}
-
-// from file
-bool get_video_info(const std::string &path, VideoInfo &info) {
-    if(mtmd_helper::is_dir(path)) return get_video_info_from_dir(path, info);
-
-    AVFormatContext* fmt = nullptr;
-    if (avformat_open_input(&fmt, path.c_str(), nullptr, nullptr) < 0)
-        return false;
-
-    std::unique_ptr<AVFormatContext, void(*)(AVFormatContext*)> fmt_guard(fmt, [](AVFormatContext* f){
-        if (f) avformat_close_input(&f);
-    });
-
-    return get_video_info_from_format_ctx(fmt, info);
-}
-
 static bool decode_video_ffmpeg_to_rgba_from_format_ctx(
     AVFormatContext* fmt,
     std::vector<DecodedFrameRGBA>& frames,
-    int max_frames,
-    int stride) 
+    mtmd_video::LoadVideoOptions opts) 
 {
+    const auto stride = opts.stride;
+    const auto max_frames = opts.max_frames;
     if(!fmt || stride <= 0 || max_frames <= 0) return false;
     if (avformat_find_stream_info(fmt, nullptr) < 0) return false;
     int vstream = av_find_best_stream(fmt, AVMEDIA_TYPE_VIDEO, -1, -1, nullptr, 0);
@@ -364,8 +362,8 @@ static bool decode_video_ffmpeg_to_rgba_from_format_ctx(
     std::unique_ptr<AVPacket, void(*)(AVPacket*)> pkt_guard(pkt, [](AVPacket *p){ if (p) av_packet_free(&p); });
 
     SwsContext * sws = nullptr;
-    int idx = 0;
-    int taken = 0;
+    uint32_t idx = 0;
+    uint32_t taken = 0;
     while (av_read_frame(fmt, pkt) >= 0) {
         if (pkt->stream_index != vstream) { av_packet_unref(pkt); continue; }
         if (avcodec_send_packet(ctx, pkt) < 0) { av_packet_unref(pkt); break; }
@@ -394,77 +392,6 @@ static bool decode_video_ffmpeg_to_rgba_from_format_ctx(
     return taken > 0;
 }
 
-// from file
-static bool decode_video_ffmpeg_to_rgba(
-    const std::string& file,
-    std::vector<DecodedFrameRGBA>& frames,
-    int max_frames,
-    int stride)
-{
-    AVFormatContext* fmt = nullptr;
-    if (avformat_open_input(&fmt, file.c_str(), nullptr, nullptr) < 0)
-        return false;
-
-    std::unique_ptr<AVFormatContext, void(*)(AVFormatContext*)> fmt_guard(fmt, [](AVFormatContext* f){
-        if (f) avformat_close_input(&f);
-    });
-
-    return decode_video_ffmpeg_to_rgba_from_format_ctx(fmt, frames, max_frames, stride);
-}
-
-// from buffer
-static bool decode_video_ffmpeg_to_rgba(
-    const uint8_t* buffer,
-    size_t size,
-    std::vector<DecodedFrameRGBA>& frames,
-    int max_frames,
-    int stride)
-{
-    if (!buffer || size == 0) return false;
-    AVFormatContext* fmt = nullptr;
-    AVIOContext* avio_ctx = nullptr;
-    uint8_t* avio_ctx_buffer = nullptr;
-
-    GGML_ASSERT(create_format_context_from_buffer(buffer, size, fmt, avio_ctx, avio_ctx_buffer));
-    
-    bool ok = decode_video_ffmpeg_to_rgba_from_format_ctx(fmt, frames, max_frames, stride);
-
-    free_format_context_from_buffer(fmt, avio_ctx);
-    return ok;
-}
-#else
-bool get_video_info(const std::string &path, VideoInfo &info){
-    if(mtmd_helper::is_dir(path)) return get_video_info_from_dir(path, info);
-    LOG_ERR("FFmpeg support is not enabled in this build\n");
-    return false;
-}
-bool get_video_info(const uint8_t* /*buffer*/, size_t /*size*/, VideoInfo &/*info*/){
-    LOG_ERR("FFmpeg support is not enabled in this build\n");
-    return false;
-}
-bool is_video_buffer(const uint8_t */*data*/, size_t /*size*/){
-    LOG_ERR("FFmpeg support is not enabled in this build\n");
-    return false;
-}
-static bool decode_video_ffmpeg_to_rgba(
-    const std::string& /*file*/,
-    std::vector<DecodedFrameRGBA>& /*frames*/,
-    int /*max_frames*/,
-    int /*stride*/)
-{
-    return false;   
-}
-static bool decode_video_ffmpeg_to_rgba(
-    const uint8_t* /*buffer*/,
-    size_t /*size*/,
-    std::vector<DecodedFrameRGBA>& /*frames*/,
-    int /*max_frames*/,
-    int /*stride*/)
-{
-    return false;   
-}
-#endif
-
 static mtmd_bitmap* convert_frames_to_bitmap(mtmd_context * ctx, const std::vector<DecodedFrameRGBA>& decoded) {
     if (!ctx) return nullptr;
     if(decoded.empty()) return nullptr;
@@ -492,70 +419,83 @@ static mtmd_bitmap* convert_frames_to_bitmap(mtmd_context * ctx, const std::vect
     return out_frames;
 }
 
-static mtmd_video::LoadVideoOptions get_video_sample_options(mtmd_video::VideoInfo info){
-    mtmd_video::LoadVideoOptions opts;
-    opts.max_frames = 32;
-    opts.stride     = 1;
-    opts.recursive  = false;
-
-    /* MiniCPM-V normal-speed video frames sample method */
-
-#ifdef MTMD_MAX_VIDEO_FRAMES_SMALL
-    // set a small number of frames for fast test locally
-    const int32_t minicpmv_max_video_frames = 4;
-#else
-    const int32_t minicpmv_max_video_frames = 64;
-#endif
-    opts.max_frames = minicpmv_max_video_frames;
-    if(info.total_frames > minicpmv_max_video_frames) {
-        // uniform sample
-        opts.stride = (int)std::ceil((double)info.total_frames / minicpmv_max_video_frames);
-    } else {
-        // 1 frame per second
-        opts.stride = (info.fps > 1.0) ? (int)std::ceil(info.fps) : 1;
-    }
-    
-    return opts;
-}
-
-mtmd_bitmap* init_video_bitmap(mtmd_context * ctx, const std::string & path) {
+mtmd_bitmap* init_video_bitmap(mtmd_context * ctx, const uint8_t* buffer, size_t size){
     auto info = mtmd_video::VideoInfo{};
-    if(!mtmd_video::get_video_info(path, info)) {
-        LOG_ERR("Unable to get video info from path: %s\n", path.c_str());
+    AVFormatContext* fmt = nullptr;
+    AVIOContext* avio_ctx = nullptr;
+    GGML_ASSERT(create_format_context_from_buffer(buffer, size, fmt, avio_ctx));
+
+    if(!get_video_info_from_format_ctx(fmt, info)) {
+        LOG_ERR("Unable to get video info from buffer\n");
+        free_format_context_from_buffer(fmt, avio_ctx);
         return nullptr;
     }
 
     const auto opts = get_video_sample_options(info);
 
-    if (mtmd_helper::is_dir(path)) {
-        return load_frames_from_dir(ctx, path, opts);
-    }
-
     std::vector<DecodedFrameRGBA> frames;
-    if(!decode_video_ffmpeg_to_rgba(path, frames, opts.max_frames, std::max(1u, opts.stride))){
-        LOG_ERR("Unable to decode video from path: %s\n", path.c_str());
+    if(!decode_video_ffmpeg_to_rgba_from_format_ctx(fmt, frames, opts)){
+        LOG_ERR("Unable to decode video from buffer\n");
+        free_format_context_from_buffer(fmt, avio_ctx);
         return nullptr;
     }
 
-    return convert_frames_to_bitmap(ctx, frames);
+    auto * res = convert_frames_to_bitmap(ctx, frames);
+    free_format_context_from_buffer(fmt, avio_ctx);
+    return res;
+}
+#else
+bool is_video_buffer(const uint8_t */*data*/, size_t /*size*/){
+    LOG_WRN("FFmpeg support is not enabled in this build, can not check it\n");
+    return false;
+}
+mtmd_bitmap* init_video_bitmap(mtmd_context * /*ctx*/, const uint8_t* /*buffer*/, size_t /*size*/){
+    LOG_ERR("FFmpeg support is not enabled in this build, can not load video from buffer\n");
+    return nullptr;
 }
+#endif
 
-mtmd_bitmap* init_video_bitmap(mtmd_context * ctx, const uint8_t* buffer, size_t size){
+mtmd_bitmap* init_video_bitmap(mtmd_context * ctx, const std::string & path) {
     auto info = mtmd_video::VideoInfo{};
-    if(!mtmd_video::get_video_info(buffer, size, info)) {
-        LOG_ERR("Unable to get video info from buffer\n");
+    
+    if(mtmd_helper::is_dir(path)){
+        get_video_info_from_dir(path, info);
+        const auto opts = get_video_sample_options(info);
+        return load_frames_from_dir(ctx, path, opts);
+    }
+
+    // handle file otherwise
+
+    #ifdef MTMD_WITH_FFMPEG
+    AVFormatContext* fmt = nullptr;
+
+    if (avformat_open_input(&fmt, path.c_str(), nullptr, nullptr) < 0){
+        LOG_ERR("Unable to open video from path: %s\n", path.c_str());
+        if(fmt) avformat_close_input(&fmt);
+        return nullptr;
+    }
+    if(!get_video_info_from_format_ctx(fmt, info)) {
+        LOG_ERR("Unable to get video info from path: %s\n", path.c_str());
+        if(fmt) avformat_close_input(&fmt);
         return nullptr;
     }
 
     const auto opts = get_video_sample_options(info);
 
     std::vector<DecodedFrameRGBA> frames;
-    if(!decode_video_ffmpeg_to_rgba(buffer, size, frames, opts.max_frames, std::max(1u, opts.stride))){
-        LOG_ERR("Unable to decode video from buffer\n");
+    if(!decode_video_ffmpeg_to_rgba_from_format_ctx(fmt, frames, opts)){
+        LOG_ERR("Unable to decode video from path: %s\n", path.c_str());
+        if(fmt) avformat_close_input(&fmt);
         return nullptr;
     }
 
-    return convert_frames_to_bitmap(ctx, frames);
+    auto * res = convert_frames_to_bitmap(ctx, frames);
+    if(fmt) avformat_close_input(&fmt);
+    return res;
+    #else
+    LOG_ERR("FFmpeg support is not enabled in this build, can not load video from file\n");
+    return nullptr;
+    #endif
 }
 
 } // namespace mtmd_video
diff --git a/tools/mtmd/mtmd-video.h b/tools/mtmd/mtmd-video.h
index 19e8a67799853..23795bf350470 100644
--- a/tools/mtmd/mtmd-video.h
+++ b/tools/mtmd/mtmd-video.h
@@ -32,11 +32,6 @@ bool is_video_file(const std::string & path);
 // Notice: audio containers may also be recognized as valid media
 bool is_video_buffer(const uint8_t *data, size_t size);
 
-// get video info (fps, total frames) via FFmpeg (mp4/mov/mkv/avi/webm...).
-// Returns true on success, false otherwise.
-bool get_video_info(const std::string &path, VideoInfo &info);
-bool get_video_info(const uint8_t* buffer, size_t size, VideoInfo &info);
-
 // Append frames loaded from a file or directory (auto-detect).
 // Returns a mtmd_bitmap containing all frames in RGB format.
 mtmd_bitmap* init_video_bitmap(mtmd_context * ctx,

From 113cfc208761d31bcee82714f52a63d6605ec59e Mon Sep 17 00:00:00 2001
From: qijia <2421653893@qq.com>
Date: Mon, 3 Nov 2025 10:23:40 +0800
Subject: [PATCH 11/13] undo the changes and submit a separate pull request to
 the llama-server later

---
 tools/server/README.md  |  4 ++--
 tools/server/server.cpp | 10 ++++-----
 tools/server/utils.hpp  | 50 ++---------------------------------------
 3 files changed, 9 insertions(+), 55 deletions(-)

diff --git a/tools/server/README.md b/tools/server/README.md
index 673d094fd0197..c16d0bd6dcd7f 100644
--- a/tools/server/README.md
+++ b/tools/server/README.md
@@ -430,7 +430,7 @@ Multiple prompts are also supported. In this case, the completion result will be
   - Strings, JSON objects, and sequences of tokens: `["string1", [12, 34, 56], { "prompt_string": "string", "multimodal_data": ["base64"]}]`
   - Mixed types: `[[12, 34, "string", 56, 78], [12, 34, 56], "string", { "prompt_string": "string" }]`
 
-Note for `multimodal_data` in JSON object prompts. This should be an array of strings, containing base64 encoded multimodal data such as images, audio and video. There must be an identical number of MTMD media markers in the string prompt element which act as placeholders for the data provided to this parameter. The multimodal data files will be substituted in order. The marker string (e.g. `<__media__>`) can be found by calling `mtmd_default_marker()` defined in [the MTMD C API](https://github.com/ggml-org/llama.cpp/blob/5fd160bbd9d70b94b5b11b0001fd7f477005e4a0/tools/mtmd/mtmd.h#L87). A client *must not* specify this field unless the server has the multimodal capability. Clients should check `/models` or `/v1/models` for the `multimodal` capability before a multimodal request.
+Note for `multimodal_data` in JSON object prompts. This should be an array of strings, containing base64 encoded multimodal data such as images and audio. There must be an identical number of MTMD media markers in the string prompt element which act as placeholders for the data provided to this parameter. The multimodal data files will be substituted in order. The marker string (e.g. `<__media__>`) can be found by calling `mtmd_default_marker()` defined in [the MTMD C API](https://github.com/ggml-org/llama.cpp/blob/5fd160bbd9d70b94b5b11b0001fd7f477005e4a0/tools/mtmd/mtmd.h#L87). A client *must not* specify this field unless the server has the multimodal capability. Clients should check `/models` or `/v1/models` for the `multimodal` capability before a multimodal request.
 
 `temperature`: Adjust the randomness of the generated text. Default: `0.8`
 
@@ -1211,7 +1211,7 @@ print(completion.choices[0].text)
 
 Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only models with a [supported chat template](https://github.com/ggml-org/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, the ChatML template will be used.
 
-If model supports multimodal, you can input the media file via `image_url` or `video_url` content part. We support both base64 and remote URL as input. See OAI documentation for more.
+If model supports multimodal, you can input the media file via `image_url` content part. We support both base64 and remote URL as input. See OAI documentation for more.
 
 *Options:*
 
diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index 740b687905e75..723b5a2d15bb0 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -3926,20 +3926,20 @@ struct server_context {
 
                     SLT_INF(slot, "n_tokens = %d, memory_seq_rm [%d, end)\n", slot.prompt.n_tokens(), p0);
 
-                    // check if we should process the media chunk (image, audio, video, ...)
+                    // check if we should process the image
                     if (slot.prompt.n_tokens() < slot.task->n_tokens() && input_tokens[slot.prompt.n_tokens()] == LLAMA_TOKEN_NULL) {
-                        // process the media
+                        // process the image
                         size_t n_tokens_out = 0;
                         int32_t res = input_tokens.process_chunk(ctx, mctx, slot.prompt.n_tokens(), slot.prompt.tokens.pos_next(), slot.id, n_tokens_out);
                         if (res != 0) {
-                            SLT_ERR(slot, "failed to process media, res = %d\n", res);
-                            send_error(slot, "failed to process media", ERROR_TYPE_SERVER);
+                            SLT_ERR(slot, "failed to process image, res = %d\n", res);
+                            send_error(slot, "failed to process image", ERROR_TYPE_SERVER);
                             slot.release();
                             continue;
                         }
 
                         slot.n_prompt_tokens_processed += n_tokens_out;
-                        // add the media chunk to cache
+                        // add the image chunk to cache
                         {
                             const auto & chunk = input_tokens.find_chunk(slot.prompt.n_tokens());
                             slot.prompt.tokens.push_back(chunk.get()); // copy
diff --git a/tools/server/utils.hpp b/tools/server/utils.hpp
index fa1d2389ddfcb..b6198edfc487c 100644
--- a/tools/server/utils.hpp
+++ b/tools/server/utils.hpp
@@ -679,53 +679,7 @@ static json oaicompat_chat_params_parse(
                 p["text"] = mtmd_default_marker();
                 p.erase("input_audio");
 
-            } else if (type == "video_url") {
-                if (!opt.allow_image) { // TODO: separate video flag?
-                    throw std::runtime_error("video input is not supported - hint: if this is unexpected, you may need to provide the mmproj");
-                }
-
-                json video_url  = json_value(p, "video_url", json::object());
-                std::string url = json_value(video_url, "url", std::string());
-                if (string_starts_with(url, "http")) {
-                    // download remote image
-                    // TODO @ngxson : maybe make these params configurable
-                    common_remote_params params;
-                    params.headers.push_back("User-Agent: llama.cpp/" + build_info);
-                    params.max_size = 1024 * 1024 * 100; // 100MB
-                    params.timeout  = 100; // seconds
-                    SRV_INF("downloading video from '%s'\n", url.c_str());
-                    auto res = common_remote_get_content(url, params);
-                    if (200 <= res.first && res.first < 300) {
-                        SRV_INF("downloaded %ld bytes\n", res.second.size());
-                        raw_buffer data;
-                        data.insert(data.end(), res.second.begin(), res.second.end());
-                        out_files.push_back(data);
-                    } else {
-                        throw std::runtime_error("Failed to download video");
-                    }
-
-                } else {
-                    // try to decode base64 video
-                    std::vector<std::string> parts = string_split<std::string>(url, /*separator*/ ',');
-                    if (parts.size() != 2) {
-                        throw std::runtime_error("Invalid video_url.url value");
-                    } else if (!string_starts_with(parts[0], "data:video/")) {
-                        throw std::runtime_error("Invalid video_url.url format: " + parts[0]);
-                    } else if (!string_ends_with(parts[0], "base64")) {
-                        throw std::runtime_error("video_url.url must be base64 encoded");
-                    } else {
-                        auto base64_data = parts[1];
-                        auto decoded_data = base64_decode(base64_data);
-                        out_files.push_back(decoded_data);
-                    }
-                }
-
-                // replace this chunk with a marker
-                p["type"] = "text";
-                p["text"] = mtmd_default_marker();
-                p.erase("video_url");
-
-            }else if (type != "text") {
+            } else if (type != "text") {
                 throw std::runtime_error("unsupported content[].type");
             }
         }
@@ -1460,7 +1414,7 @@ static server_tokens process_mtmd_prompt(mtmd_context * mctx, std::string prompt
     for (auto & file : files) {
         mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(mctx, file.data(), file.size()));
         if (!bmp.ptr) {
-            throw std::runtime_error("Failed to load media file");
+            throw std::runtime_error("Failed to load image or audio file");
         }
         // calculate bitmap hash (for KV caching)
         std::string hash = fnv_hash(bmp.data(), bmp.n_bytes());

From ef68f2a8bbf60e7d216ad2b199543217ae8880e5 Mon Sep 17 00:00:00 2001
From: qijia <2421653893@qq.com>
Date: Mon, 3 Nov 2025 15:28:43 +0800
Subject: [PATCH 12/13] refactor: modifications made based on reviews

---
 tools/mtmd/CMakeLists.txt  |   5 -
 tools/mtmd/clip.cpp        |   6 -
 tools/mtmd/clip.h          |   1 -
 tools/mtmd/mtmd-helper.cpp |  71 +---------
 tools/mtmd/mtmd-helper.h   |   9 +-
 tools/mtmd/mtmd-video.cpp  | 266 +++++++++++++++++++++++++++++--------
 tools/mtmd/mtmd.cpp        |  44 +++---
 tools/mtmd/mtmd.h          |   9 +-
 8 files changed, 242 insertions(+), 169 deletions(-)

diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt
index b4f10b882f96e..e135d4727f0a4 100644
--- a/tools/mtmd/CMakeLists.txt
+++ b/tools/mtmd/CMakeLists.txt
@@ -39,11 +39,6 @@ if (MTMD_WITH_FFMPEG)
     endif()
 endif()
 
-option(MTMD_MAX_VIDEO_FRAMES_SMALL "Set a small number of frames for fast test locally" OFF)
-if(MTMD_MAX_VIDEO_FRAMES_SMALL)
-    target_compile_definitions(mtmd PRIVATE MTMD_MAX_VIDEO_FRAMES_SMALL)
-endif()
-
 if (BUILD_SHARED_LIBS)
     set_target_properties     (mtmd PROPERTIES POSITION_INDEPENDENT_CODE ON)
     target_compile_definitions(mtmd PRIVATE LLAMA_BUILD)
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index c3770adf90482..866bebeb16767 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -4842,12 +4842,6 @@ bool clip_has_whisper_encoder(const struct clip_ctx * ctx) {
         || ctx->proj_type() == PROJECTOR_TYPE_VOXTRAL;
 }
 
-void clip_set_minicpmv_max_slice_nums(struct clip_ctx * ctx, int n) {
-    if (!ctx) return;
-    if (n < 0) n = 0;
-    ctx->model.hparams.minicpmv_max_slice_nums = n;
-}
-
 bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
     clip_image_f32 clip_img;
     clip_img.buf.resize(h * w * 3);
diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h
index c67bcc3a13c99..3387cdbd36955 100644
--- a/tools/mtmd/clip.h
+++ b/tools/mtmd/clip.h
@@ -91,7 +91,6 @@ bool clip_image_encode      (struct clip_ctx * ctx, int n_threads, struct clip_i
 bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);
 
 int clip_is_minicpmv(const struct clip_ctx * ctx);
-void clip_set_minicpmv_max_slice_nums(struct clip_ctx * ctx, int n);
 bool clip_is_glm(const struct clip_ctx * ctx);
 bool clip_is_qwen2vl(const struct clip_ctx * ctx);
 bool clip_is_llava(const struct clip_ctx * ctx);
diff --git a/tools/mtmd/mtmd-helper.cpp b/tools/mtmd/mtmd-helper.cpp
index 8f000be64e9d6..bdb9cc82faf2f 100644
--- a/tools/mtmd/mtmd-helper.cpp
+++ b/tools/mtmd/mtmd-helper.cpp
@@ -447,9 +447,11 @@ mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigne
 }
 
 mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * path) {
-    // although we could read the file into memory and call mtmd_helper_bitmap_init_from_buf,
+    // Attention! A directory containing frame images is also considered a video,
+    // so a path that is a directory should be handled by mtmd_video::init_video_bitmap.
+    // Besides, although we could read the file into memory and call mtmd_helper_bitmap_init_from_buf,
     // but for video files, it's better to let ffmpeg read from file
-    if(mtmd_video::is_video_file(path) || mtmd_helper::is_dir(path)){
+    if(mtmd_video::is_video_file(path)){
         return mtmd_video::init_video_bitmap(ctx, path);
     }
 
@@ -475,68 +477,3 @@ mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char *
     delete [] buf;
     return res;
 }
-
-namespace mtmd_helper{
-
-bool has_image_ext(const std::string & name) {
-    auto lower = name;
-    std::transform(lower.begin(), lower.end(), lower.begin(), [](unsigned char c){ return (char)std::tolower(c); });
-    return lower.rfind(".jpg")  != std::string::npos ||
-           lower.rfind(".jpeg") != std::string::npos ||
-           lower.rfind(".png")  != std::string::npos ||
-           lower.rfind(".bmp")  != std::string::npos ||
-           lower.rfind(".gif")  != std::string::npos ||
-           lower.rfind(".webp") != std::string::npos;
-}
-
-bool is_dir(const std::string & path) {
-#if defined(_WIN32)
-    DWORD attrs = GetFileAttributesA(path.c_str());
-    return (attrs != INVALID_FILE_ATTRIBUTES) && (attrs & FILE_ATTRIBUTE_DIRECTORY);
-#else
-    struct stat st;
-    if (stat(path.c_str(), &st) != 0) return false;
-    return S_ISDIR(st.st_mode);
-#endif
-}
-
-void list_files(const std::string & dir, std::vector<std::string> & out, bool recursive) {
-#if defined(_WIN32)
-    std::string pattern = dir;
-    if (!pattern.empty() && pattern.back() != '/' && pattern.back() != '\\') pattern += "\\";
-    pattern += "*";
-    WIN32_FIND_DATAA ffd;
-    HANDLE hFind = FindFirstFileA(pattern.c_str(), &ffd);
-    if (hFind == INVALID_HANDLE_VALUE) return;
-    do {
-        std::string name = ffd.cFileName;
-        if (name == "." || name == "..") continue;
-        std::string path = dir;
-        if (!path.empty() && path.back() != '/' && path.back() != '\\') path += "\\";
-        path += name;
-        if (ffd.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) {
-            if (recursive) list_files(path, out, recursive);
-        } else {
-            out.push_back(path);
-        }
-    } while (FindNextFileA(hFind, &ffd) != 0);
-    FindClose(hFind);
-#else
-    DIR * dp = opendir(dir.c_str());
-    if (!dp) return;
-    struct dirent * de;
-    while ((de = readdir(dp)) != nullptr) {
-        std::string name = de->d_name;
-        if (name == "." || name == "..") continue;
-        std::string path = dir + "/" + name;
-        if (is_dir(path)) {
-            if (recursive) list_files(path, out, recursive);
-        } else {
-            out.push_back(path);
-        }
-    }
-    closedir(dp);
-#endif
-}
-
-}
\ No newline at end of file
diff --git a/tools/mtmd/mtmd-helper.h b/tools/mtmd/mtmd-helper.h
index 73ad8c89b45c9..e9085e036da82 100644
--- a/tools/mtmd/mtmd-helper.h
+++ b/tools/mtmd/mtmd-helper.h
@@ -21,7 +21,8 @@ extern "C" {
 //
 
 // helper function to construct a mtmd_bitmap from a file
-// it calls mtmd_helper_bitmap_init_from_buf() internally
+// for image and audio, it calls mtmd_helper_bitmap_init_from_buf() internally
+// for video, it calls init_video_bitmap() to read, decode and stream individual image frames to a bitmap
 // returns nullptr on failure
 // this function is thread-safe
 MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * path);
@@ -88,10 +89,4 @@ MTMD_API int32_t mtmd_helper_decode_image_chunk(mtmd_context * ctx,
 // C++ wrappers
 //
 
-namespace mtmd_helper{
-    bool has_image_ext(const std::string & name);
-    bool is_dir(const std::string & path);
-    void list_files(const std::string & dir, std::vector<std::string> & out, bool recursive);
-}
-
 #endif
diff --git a/tools/mtmd/mtmd-video.cpp b/tools/mtmd/mtmd-video.cpp
index 88892d389535f..cc5ee3a3567ca 100644
--- a/tools/mtmd/mtmd-video.cpp
+++ b/tools/mtmd/mtmd-video.cpp
@@ -1,5 +1,17 @@
+#include <cstdint>
+#if defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+#   define NOMINMAX
+#endif
+#include <windows.h>
+#else
+#include <dirent.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#endif
+
 #include "mtmd-video.h"
-#include "clip-impl.h"
 #include "ggml.h"
 #include "mtmd-helper.h"
 
@@ -11,11 +23,123 @@
 #include <cstring>
 #include <memory>
 #include <cmath>
+#include <fstream>
+
+#define LOG_INF(...) fprintf(stdout, __VA_ARGS__)
+#define LOG_WRN(...) fprintf(stdout, __VA_ARGS__)
+#define LOG_ERR(...) fprintf(stderr, __VA_ARGS__)
+
+// Decide whether `path` names an image file that can be used as a video frame:
+//   1. the lower-cased path must end with .jpg, .jpeg, .png, .bmp, .gif or .webp
+//   2. the file must start with a JPEG, PNG, BMP, GIF or WEBP signature
+//      (the signature is not required to agree with the extension)
+// Returns false if the file cannot be opened or holds fewer than 4 bytes.
+static bool is_image_file(const std::string & path) {
+    // check extension first
+    std::string lower = path;
+    std::transform(lower.begin(), lower.end(), lower.begin(),
+                   [](unsigned char c){ return (char)std::tolower(c); });
+
+    // the extension must be a suffix; rfind() would also accept "frame.png.txt"
+    static const char * const exts[] = { ".jpg", ".jpeg", ".png", ".bmp", ".gif", ".webp" };
+    bool has_ext = false;
+    for (const char * ext : exts) {
+        const size_t len = std::strlen(ext);
+        if (lower.size() >= len && lower.compare(lower.size() - len, len, ext) == 0) {
+            has_ext = true;
+            break;
+        }
+    }
+    if (!has_ext) return false;
+
+    // check magic bytes then
+    std::ifstream f(path, std::ios::binary);
+    if (!f.is_open()) return false;
+
+    unsigned char header[16] = {0}; // zero-filled so short reads never expose garbage
+    f.read(reinterpret_cast<char*>(header), sizeof(header));
+    const size_t n = (size_t) f.gcount();
+    if (n < 4) return false; // too small to be an image
+
+    // JPEG: FF D8 FF
+    static const unsigned char jpeg_sig[] = { 0xFF, 0xD8, 0xFF };
+    if (std::memcmp(header, jpeg_sig, sizeof(jpeg_sig)) == 0)
+        return true;
+
+    // PNG: 89 50 4E 47 0D 0A 1A 0A
+    static const unsigned char png_sig[] = { 0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A };
+    if (n >= sizeof(png_sig) && std::memcmp(header, png_sig, sizeof(png_sig)) == 0)
+        return true;
+
+    // BMP: 42 4D
+    if (header[0] == 0x42 && header[1] == 0x4D)
+        return true;
+
+    // GIF: "GIF87a" or "GIF89a"
+    if (n >= 6 && (std::memcmp(header, "GIF87a", 6) == 0 || std::memcmp(header, "GIF89a", 6) == 0))
+        return true;
+
+    // WEBP: "RIFF....WEBP"
+    if (n >= 12 && std::memcmp(header, "RIFF", 4) == 0 && std::memcmp(header + 8, "WEBP", 4) == 0)
+        return true;
 
+    return false;
+}
+
+static bool is_dir(const std::string & path) { // true only when `path` can be queried and refers to a directory
+#if defined(_WIN32)
+    DWORD attrs = GetFileAttributesA(path.c_str());
+    return (attrs != INVALID_FILE_ATTRIBUTES) && (attrs & FILE_ATTRIBUTE_DIRECTORY); // INVALID_FILE_ATTRIBUTES means the attribute query failed
+#else
+    struct stat st;
+    if (stat(path.c_str(), &st) != 0) return false; // a failed stat() is reported as "not a directory"
+    return S_ISDIR(st.st_mode);
+#endif
+}
+
+static void list_files(const std::string & dir, std::vector<std::string> & out, bool recursive) { // appends every non-directory entry of `dir` to `out`; descends into subdirectories only when `recursive` is true
+#if defined(_WIN32)
+    std::string pattern = dir; // becomes the search pattern "<dir>\*", which matches every entry of the directory
+    if (!pattern.empty() && pattern.back() != '/' && pattern.back() != '\\') pattern += "\\";
+    pattern += "*";
+    WIN32_FIND_DATAA ffd;
+    HANDLE hFind = FindFirstFileA(pattern.c_str(), &ffd);
+    if (hFind == INVALID_HANDLE_VALUE) return; // the search could not be started: `out` is left unchanged
+    do {
+        std::string name = ffd.cFileName;
+        if (name == "." || name == "..") continue; // skip the self and parent entries
+        std::string path = dir;
+        if (!path.empty() && path.back() != '/' && path.back() != '\\') path += "\\"; // add a separator only when `dir` does not already end with one
+        path += name;
+        if (ffd.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) {
+            if (recursive) list_files(path, out, recursive); // subdirectories themselves are never added to `out`
+        } else {
+            out.push_back(path);
+        }
+    } while (FindNextFileA(hFind, &ffd) != 0);
+    FindClose(hFind);
+#else
+    DIR * dp = opendir(dir.c_str());
+    if (!dp) return; // the directory cannot be opened: `out` is left unchanged
+    struct dirent * de;
+    while ((de = readdir(dp)) != nullptr) {
+        std::string name = de->d_name;
+        if (name == "." || name == "..") continue; // skip the self and parent entries
+        std::string path = dir + "/" + name;
+        if (is_dir(path)) {
+            if (recursive) list_files(path, out, recursive); // subdirectories themselves are never added to `out`
+        } else {
+            out.push_back(path); // appended in readdir() order; no sorting happens here
+        }
+    }
+    closedir(dp);
+#endif
+}
 
 namespace mtmd_video {
 
 bool is_video_file(const std::string & path){
+    if(is_dir(path)) return true; // a directory of frame images is also treated as a video
     auto lower = path;
     std::transform(lower.begin(), lower.end(), lower.begin(), [](unsigned char c){ return (char)std::tolower(c); });
     return lower.rfind(".mp4")  != std::string::npos ||
@@ -28,23 +152,22 @@ bool is_video_file(const std::string & path){
 static void get_video_info_from_dir(const std::string &path, VideoInfo &info){
     info.fps = 1; // do not care
     std::vector<std::string> files;
-    mtmd_helper::list_files(path, files, true); // recursive
+    list_files(path, files, true); // recursive
     info.total_frames = files.size();
 }
 // untested
 static mtmd_bitmap* load_frames_from_dir(mtmd_context * ctx,
                           const std::string & dir_path,
                           const LoadVideoOptions & opts) {
-    if (!ctx || dir_path.empty() || !mtmd_helper::is_dir(dir_path) || opts.max_frames < 1) {
+    if (!ctx || dir_path.empty() || !is_dir(dir_path) || opts.max_frames < 1) {
         return nullptr;
     }
     std::vector<std::string> files;
-    mtmd_helper::list_files(dir_path, files, opts.recursive);
+    list_files(dir_path, files, opts.recursive);
     std::sort(files.begin(), files.end());
 
     auto stride = std::max(1u, opts.stride);
     size_t loaded = 0;
-    unsigned char* dest = nullptr;
     mtmd_bitmap* out_frames = nullptr;
     const auto nframes = std::min(files.size() / stride, (size_t)opts.max_frames);
 
@@ -52,20 +175,16 @@ static mtmd_bitmap* load_frames_from_dir(mtmd_context * ctx,
     for (size_t i = 0; i < files.size(); i++) {
         if (i % stride != 0) continue;
         const std::string & f = files[i];
-        if (!mtmd_helper::has_image_ext(f)) continue;
+        if (!is_image_file(f)) continue;
         mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(ctx, f.c_str()));
         if (!bmp.ptr) continue;
         if(loaded==0){
             w = bmp.nx();
             h = bmp.ny();
             out_frames = mtmd_bitmap_init_from_video(w, h, nframes, nullptr);
-            dest = mtmd_bitmap_get_data_mutable(out_frames);
         }
         GGML_ASSERT(bmp.nx() == w && bmp.ny() == h); // all frames must have the same size
-        std::memcpy(dest,
-                    bmp.data(),
-                    bmp.n_bytes());
-        dest += bmp.n_bytes();
+        mtmd_bitmap_set_frame(out_frames, loaded, bmp.data());
         loaded++;
         if (loaded >= nframes) break;
     }
@@ -73,10 +192,14 @@ static mtmd_bitmap* load_frames_from_dir(mtmd_context * ctx,
     return out_frames;
 }
 
-struct DecodedFrameRGBA {
+// Decoded video frames, all sharing one width and height, stored in decode order.
+// This frame queue is designed for streaming scenarios,
+// where an FFmpeg-related thread acts as the producer generating frames,
+// and the main program (the LLM inference thread) acts as the consumer processing them.
+struct DecodedFramesQueue {
     int width;
     int height;
-    std::vector<unsigned char> rgba; // size = width * height * 4
+    std::queue<std::vector<unsigned char>> frames; // frame size = width * height * 3
 };
 
 static mtmd_video::LoadVideoOptions get_video_sample_options(mtmd_video::VideoInfo info){
@@ -86,13 +209,12 @@ static mtmd_video::LoadVideoOptions get_video_sample_options(mtmd_video::VideoIn
     opts.recursive  = false;
 
     /* MiniCPM-V normal-speed video frames sample method */
-
-#ifdef MTMD_MAX_VIDEO_FRAMES_SMALL
-    // set a small number of frames for fast test locally
-    const int32_t minicpmv_max_video_frames = 4;
-#else
-    const int32_t minicpmv_max_video_frames = 64;
-#endif
+    bool debug_use_small_frames_for_video = std::getenv("MTMD_MAX_VIDEO_FRAMES_SMALL") != nullptr;
+    int32_t minicpmv_max_video_frames = 64;
+    if(debug_use_small_frames_for_video){
+        // set a small number of frames for fast test locally
+        minicpmv_max_video_frames = 4;
+    }
     opts.max_frames = minicpmv_max_video_frames;
     if(info.total_frames > minicpmv_max_video_frames) {
         // uniform sample
@@ -114,6 +236,7 @@ extern "C" {
 #include <libavformat/avformat.h>
 #include <libavcodec/avcodec.h>
 #include <libswscale/swscale.h>
+#include <libavutil/imgutils.h>
 }
 #pragma GCC diagnostic pop
 
@@ -338,23 +461,35 @@ static bool get_video_info_from_format_ctx(AVFormatContext *fmt, VideoInfo &info
 
 static bool decode_video_ffmpeg_to_rgba_from_format_ctx(
     AVFormatContext* fmt,
-    std::vector<DecodedFrameRGBA>& frames,
+    DecodedFramesQueue& out_q,
     mtmd_video::LoadVideoOptions opts) 
 {
     const auto stride = opts.stride;
     const auto max_frames = opts.max_frames;
     if(!fmt || stride <= 0 || max_frames <= 0) return false;
-    if (avformat_find_stream_info(fmt, nullptr) < 0) return false;
+    // vstream checked before in get_video_info_from_format_ctx
     int vstream = av_find_best_stream(fmt, AVMEDIA_TYPE_VIDEO, -1, -1, nullptr, 0);
     if (vstream < 0) return false;
     AVStream * st = fmt->streams[vstream];
     const AVCodec * dec = avcodec_find_decoder(st->codecpar->codec_id);
-    if (!dec) return false;
+    if (!dec){
+        LOG_ERR("%s: Could not find decoder for codec id %d\n", __func__, st->codecpar->codec_id);
+        return false;
+    }
     AVCodecContext * ctx = avcodec_alloc_context3(dec);
-    if (!ctx) return false;
+    if (!ctx){
+        LOG_ERR("%s: Could not allocate codec context\n", __func__);
+        return false;
+    }
     std::unique_ptr<AVCodecContext, void(*)(AVCodecContext*)> ctx_guard(ctx, [](AVCodecContext *c){ if (c) avcodec_free_context(&c); });
-    if (avcodec_parameters_to_context(ctx, st->codecpar) < 0) return false;
-    if (avcodec_open2(ctx, dec, nullptr) < 0) return false;
+    if (avcodec_parameters_to_context(ctx, st->codecpar) < 0){
+        LOG_ERR("%s: Could not copy codec parameters to codec context\n", __func__);
+        return false;
+    }
+    if (avcodec_open2(ctx, dec, nullptr) < 0){
+        LOG_ERR("%s: Could not open codec2\n", __func__);
+        return false;
+    }
 
     AVFrame * frame = av_frame_alloc();
     AVPacket * pkt  = av_packet_alloc();
@@ -364,56 +499,66 @@ static bool decode_video_ffmpeg_to_rgba_from_format_ctx(
     SwsContext * sws = nullptr;
     uint32_t idx = 0;
     uint32_t taken = 0;
+    const int w = ctx->width;
+    const int h = ctx->height;
+
+    if(!out_q.frames.empty() && (out_q.height!=h || out_q.width!=w)){
+        LOG_ERR("Different size frames remain in decoded frames queue");
+        return false;
+    }
+    out_q.width = w;
+    out_q.height = h;
+
     while (av_read_frame(fmt, pkt) >= 0) {
         if (pkt->stream_index != vstream) { av_packet_unref(pkt); continue; }
         if (avcodec_send_packet(ctx, pkt) < 0) { av_packet_unref(pkt); break; }
         av_packet_unref(pkt);
         while (avcodec_receive_frame(ctx, frame) == 0) {
-            if (idx++ % stride != stride/2) continue;
+            if (idx++ % stride != stride/2) { av_frame_unref(frame); continue; }
             if (!sws) {
                 sws = sws_getContext(frame->width, frame->height, (AVPixelFormat)frame->format,
-                                     frame->width, frame->height, AV_PIX_FMT_RGBA,
-                                     SWS_BILINEAR, nullptr, nullptr, nullptr);
-                if (!sws) return false;
+                                        frame->width, frame->height, AV_PIX_FMT_RGB24,
+                                        SWS_BILINEAR, nullptr, nullptr, nullptr);
+                if (frame->format == AV_PIX_FMT_NONE) { av_frame_unref(frame); continue; }
+                if (!sws){
+                    LOG_ERR("%s: Could not initialize sws context\n", __func__);
+                    av_frame_unref(frame);
+                    return false;
+                }
             }
-            DecodedFrameRGBA out;
-            out.width = frame->width;
-            out.height = frame->height;
-            out.rgba.resize((size_t)frame->width * frame->height * 4);
-            uint8_t * dst_data[4] = { out.rgba.data(), nullptr, nullptr, nullptr };
-            int dst_linesize[4] = { frame->width * 4, 0, 0, 0 };
+
+            GGML_ASSERT(w==frame->width && h==frame->height);
+
+            std::vector<uint8_t> out;
+            out.resize((size_t)frame->width * frame->height * 3);
+
+            uint8_t *dst_data[4] = { out.data(), nullptr, nullptr, nullptr };
+            int dst_linesize[4] = { frame->width * 3, 0, 0, 0 };
             sws_scale(sws, frame->data, frame->linesize, 0, frame->height, dst_data, dst_linesize);
-            frames.push_back(std::move(out));
+        
+            out_q.frames.emplace(out);
             taken++;
-            if (taken >= max_frames) break;
+            av_frame_unref(frame);
+            if (max_frames > 0 && taken >= max_frames) break;
         }
     }
     if (sws) sws_freeContext(sws);
     return taken > 0;
 }
 
-static mtmd_bitmap* convert_frames_to_bitmap(mtmd_context * ctx, const std::vector<DecodedFrameRGBA>& decoded) {
+static mtmd_bitmap* convert_frames_to_bitmap(mtmd_context * ctx, DecodedFramesQueue& decoded) {
     if (!ctx) return nullptr;
-    if(decoded.empty()) return nullptr;
-    const size_t nframes = decoded.size();
+    if(decoded.frames.empty()) return nullptr;
+    const size_t nframes = decoded.frames.size();
     if(nframes < 1){
         return nullptr;
     }
-    const int w = decoded[0].width;
-    const int h = decoded[0].height;
+    const int w = decoded.width;
+    const int h = decoded.height;
     mtmd_bitmap* out_frames = mtmd_bitmap_init_from_video(uint32_t(w), uint32_t(h), uint32_t(nframes), nullptr);
-    unsigned char * dst = mtmd_bitmap_get_data_mutable(out_frames);
-
-    for (const auto & fr : decoded) {
-        GGML_ASSERT(w == fr.width && h == fr.height);
-        const unsigned char * src = fr.rgba.data();
-        for (int i = 0; i < w * h; ++i) {
-            dst[0] = src[0]; // R
-            dst[1] = src[1]; // G
-            dst[2] = src[2]; // B
-            dst += 3;
-            src += 4; // skip A
-        }
+    for(size_t i=0;i<nframes;++i){
+        mtmd_bitmap_set_frame(out_frames, i, decoded.frames.front().data());
+        decoded.frames.pop();
     }
 
     return out_frames;
@@ -433,7 +578,7 @@ mtmd_bitmap* init_video_bitmap(mtmd_context * ctx, const uint8_t* buffer, size_t
 
     const auto opts = get_video_sample_options(info);
 
-    std::vector<DecodedFrameRGBA> frames;
+    DecodedFramesQueue frames;
     if(!decode_video_ffmpeg_to_rgba_from_format_ctx(fmt, frames, opts)){
         LOG_ERR("Unable to decode video from buffer\n");
         free_format_context_from_buffer(fmt, avio_ctx);
@@ -458,7 +603,7 @@ mtmd_bitmap* init_video_bitmap(mtmd_context * /*ctx*/, const uint8_t* /*buffer*/
 mtmd_bitmap* init_video_bitmap(mtmd_context * ctx, const std::string & path) {
     auto info = mtmd_video::VideoInfo{};
     
-    if(mtmd_helper::is_dir(path)){
+    if(is_dir(path)){
         get_video_info_from_dir(path, info);
         const auto opts = get_video_sample_options(info);
         return load_frames_from_dir(ctx, path, opts);
@@ -468,12 +613,13 @@ mtmd_bitmap* init_video_bitmap(mtmd_context * ctx, const std::string & path) {
 
     #ifdef MTMD_WITH_FFMPEG
     AVFormatContext* fmt = nullptr;
-
+    LOG_INF("Loading video from path: %s\n", path.c_str());
     if (avformat_open_input(&fmt, path.c_str(), nullptr, nullptr) < 0){
         LOG_ERR("Unable to open video from path: %s\n", path.c_str());
         if(fmt) avformat_close_input(&fmt);
         return nullptr;
     }
+    LOG_INF("get_video_info_from_format_ctx...\n");
     if(!get_video_info_from_format_ctx(fmt, info)) {
         LOG_ERR("Unable to get video info from path: %s\n", path.c_str());
         if(fmt) avformat_close_input(&fmt);
@@ -482,7 +628,9 @@ mtmd_bitmap* init_video_bitmap(mtmd_context * ctx, const std::string & path) {
 
     const auto opts = get_video_sample_options(info);
 
-    std::vector<DecodedFrameRGBA> frames;
+    LOG_INF("decode_video_ffmpeg_to_rgba_from_format_ctx...\n");
+
+    DecodedFramesQueue frames;
     if(!decode_video_ffmpeg_to_rgba_from_format_ctx(fmt, frames, opts)){
         LOG_ERR("Unable to decode video from path: %s\n", path.c_str());
         if(fmt) avformat_close_input(&fmt);
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index aeb8cf5f1a64a..07dd41f564ec7 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -23,11 +23,11 @@
 #include <limits>
 #include <vector>
 
-enum class mtmd_bitmap_type {
-    IMAGE,
-    AUDIO,
-    VIDEO,
-}; 
+enum mtmd_bitmap_type {
+    MTMD_BITMAP_TYPE_IMAGE,
+    MTMD_BITMAP_TYPE_AUDIO,
+    MTMD_BITMAP_TYPE_VIDEO,
+};
 
 // if bitmap is image:
 //     length of data must be nx * ny * 3
@@ -42,7 +42,7 @@ struct mtmd_bitmap {
     mtmd_bitmap_type type;
     uint32_t nx;
     uint32_t ny; // for audio, ny=1
-    uint32_t nz; // for video: number of frames
+    uint32_t nz; // for video: number of frames, always 1 for image and audio
     std::vector<unsigned char> data;
     std::string id; // optional user-defined id, for ex: can be set to image hash, useful for KV cache tracking
 };
@@ -907,10 +907,6 @@ int mtmd_get_audio_bitrate(mtmd_context * ctx) {
     return 16000; // 16kHz
 }
 
-void mtmd_set_minicpmv_max_slice_nums(mtmd_context * ctx, int n) {
-    if (!ctx || !ctx->ctx_v) return;
-    clip_set_minicpmv_max_slice_nums(ctx->ctx_v, n);
-}
 
 //
 // public API functions
@@ -925,7 +921,7 @@ mtmd_bitmap * mtmd_bitmap_init(uint32_t nx,
     bitmap->nx = nx;
     bitmap->ny = ny;
     bitmap->nz = 1;
-    bitmap->type = mtmd_bitmap_type::IMAGE;
+    bitmap->type = MTMD_BITMAP_TYPE_IMAGE;
     size_t data_size = (size_t)nx * ny * 3;
     bitmap->data.resize(data_size);
     std::memcpy(bitmap->data.data(), data, data_size);
@@ -938,7 +934,7 @@ mtmd_bitmap * mtmd_bitmap_init_from_audio(size_t n_samples,
     bitmap->nx = n_samples;
     bitmap->ny = 1;
     bitmap->nz = 1;
-    bitmap->type = mtmd_bitmap_type::AUDIO;
+    bitmap->type = MTMD_BITMAP_TYPE_AUDIO;
     size_t data_size = n_samples * sizeof(float);
     bitmap->data.resize(data_size);
     std::memcpy(bitmap->data.data(), data, data_size);
@@ -950,7 +946,7 @@ mtmd_bitmap * mtmd_bitmap_init_from_video(uint32_t nx, uint32_t ny, uint32_t nfr
     bitmap->nx = nx;
     bitmap->ny = ny;
     bitmap->nz = nframes;
-    bitmap->type = mtmd_bitmap_type::VIDEO;
+    bitmap->type = MTMD_BITMAP_TYPE_VIDEO;
     size_t data_size = (size_t)nx * ny * nframes * 3;
     bitmap->data.resize(data_size);
     if(data != nullptr){
@@ -972,20 +968,16 @@ const unsigned char * mtmd_bitmap_get_data(const mtmd_bitmap * bitmap) {
     return bitmap->data.data();
 }
 
-unsigned char * mtmd_bitmap_get_data_mutable(mtmd_bitmap * bitmap) {
-    return bitmap->data.data();
-}
-
 size_t mtmd_bitmap_get_n_bytes(const mtmd_bitmap * bitmap) {
     return bitmap->data.size();
 }
 
 bool mtmd_bitmap_is_audio(const mtmd_bitmap * bitmap) {
-    return bitmap->type == mtmd_bitmap_type::AUDIO;
+    return bitmap->type == MTMD_BITMAP_TYPE_AUDIO;
 }
 
 bool mtmd_bitmap_is_video(const mtmd_bitmap * bitmap) {
-    return bitmap->type == mtmd_bitmap_type::VIDEO;
+    return bitmap->type == MTMD_BITMAP_TYPE_VIDEO;
 }
 
 const char * mtmd_bitmap_get_id(const mtmd_bitmap * bitmap) {
@@ -1006,6 +998,20 @@ void mtmd_bitmap_free(mtmd_bitmap * bitmap) {
     }
 }
 
+bool mtmd_bitmap_set_frame(mtmd_bitmap * bitmap, size_t i, const unsigned char * data) {
+    if (!bitmap || !data || bitmap->type != MTMD_BITMAP_TYPE_VIDEO) { // null checks come first: both pointers are dereferenced below
+        LOG_ERR("%s: error: null argument or bitmap is not a video\n", __func__);
+        return false;
+    }
+    if (i >= (size_t)bitmap->nz) {
+        LOG_ERR("%s: error: frame index %zu out of range (nz = %u)\n", __func__, i, bitmap->nz);
+        return false;
+    }
+    const size_t frame_size = (size_t)bitmap->nx * bitmap->ny * 3; // bytes in one RGB frame
+    std::memcpy(bitmap->data.data() + i * frame_size, data, frame_size); // `data` must provide at least frame_size bytes
+    return true;
+}
+
 // mtmd_input_chunks
 
 mtmd_input_chunks * mtmd_input_chunks_init() {
diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h
index 9fb5d560c7530..b5afcebff2cb2 100644
--- a/tools/mtmd/mtmd.h
+++ b/tools/mtmd/mtmd.h
@@ -112,10 +112,6 @@ MTMD_API bool mtmd_support_audio(mtmd_context * ctx);
 // return -1 if audio is not supported
 MTMD_API int mtmd_get_audio_bitrate(mtmd_context * ctx);
 
-// set MiniCPM-V UHD slicing upper bound (used when preprocessing images for MiniCPM-V)
-// values < 1 will be clamped to 1
-MTMD_API void mtmd_set_minicpmv_max_slice_nums(mtmd_context * ctx, int n);
-
 // mtmd_bitmap
 //
 // if bitmap is image:
@@ -127,17 +123,20 @@ MTMD_API void mtmd_set_minicpmv_max_slice_nums(mtmd_context * ctx, int n);
 // if bitmap is video:
 //     length of data must be nx * ny * nframes * 3
 //     the data is in RGBRGBRGB... format for each frame, frames are stored sequentially
+//     if data is nullptr, the bitmap is allocated without frame content; fill it later with mtmd_bitmap_set_frame()
 MTMD_API mtmd_bitmap *         mtmd_bitmap_init           (uint32_t nx, uint32_t ny, const unsigned char * data);
 MTMD_API mtmd_bitmap *         mtmd_bitmap_init_from_audio(size_t n_samples,         const float         * data);
 MTMD_API mtmd_bitmap *         mtmd_bitmap_init_from_video(uint32_t nx, uint32_t ny, uint32_t nframes, const unsigned char * data);
 MTMD_API uint32_t              mtmd_bitmap_get_nx     (const mtmd_bitmap * bitmap);
 MTMD_API uint32_t              mtmd_bitmap_get_ny     (const mtmd_bitmap * bitmap);
 MTMD_API const unsigned char * mtmd_bitmap_get_data   (const mtmd_bitmap * bitmap);
-MTMD_API unsigned char * mtmd_bitmap_get_data_mutable (mtmd_bitmap * bitmap);
 MTMD_API size_t                mtmd_bitmap_get_n_bytes(const mtmd_bitmap * bitmap);
 MTMD_API bool                  mtmd_bitmap_is_audio   (const mtmd_bitmap * bitmap);
 MTMD_API bool                  mtmd_bitmap_is_video   (const mtmd_bitmap * bitmap);
 MTMD_API void                  mtmd_bitmap_free       (mtmd_bitmap * bitmap);
+// set single frame i of the video bitmap
+// return false if bitmap is not video or i is out of range
+MTMD_API bool                  mtmd_bitmap_set_frame  (mtmd_bitmap * bitmap, size_t i, const unsigned char * data);
 // bitmap ID is optional, but useful for KV cache tracking
 // these getters/setters are dedicated functions, so you can for example calculate the hash of the image based on mtmd_bitmap_get_data()
 MTMD_API const char * mtmd_bitmap_get_id(const mtmd_bitmap * bitmap);

From 9396d6ee194b21c9e79d6164d93b73b560f4c674 Mon Sep 17 00:00:00 2001
From: qijia <2421653893@qq.com>
Date: Mon, 3 Nov 2025 19:08:57 +0800
Subject: [PATCH 13/13] fix: add queue header file

---
 tools/mtmd/mtmd-video.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/mtmd/mtmd-video.cpp b/tools/mtmd/mtmd-video.cpp
index cc5ee3a3567ca..503d9058487c7 100644
--- a/tools/mtmd/mtmd-video.cpp
+++ b/tools/mtmd/mtmd-video.cpp
@@ -24,6 +24,7 @@
 #include <memory>
 #include <cmath>
 #include <fstream>
+#include <queue>
 
 #define LOG_INF(...) fprintf(stdout, __VA_ARGS__)
 #define LOG_WRN(...) fprintf(stdout, __VA_ARGS__)