diff --git a/.clang-tidy b/.clang-tidy
index 803b8b46a32f3..41f5f5ce626a5 100644
--- a/.clang-tidy
+++ b/.clang-tidy
@@ -14,6 +14,8 @@ Checks: >
     -readability-uppercase-literal-suffix,
     -readability-simplify-boolean-expr,
     -readability-math-missing-parentheses,
+    -readability-braces-around-statements,
+    -readability-isolate-declaration,
     clang-analyzer-*,
     -clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,
     performance-*,
diff --git a/common/arg.cpp b/common/arg.cpp
index 4316917d74595..4bb04cc9902a1 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2768,6 +2768,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.image.emplace_back(value);
         }
     ).set_examples({LLAMA_EXAMPLE_MTMD}));
+    add_opt(common_arg(
+        {"--video"}, "PATH",
+        "path to a video file (requires FFmpeg at build time) or a directory of frames; can be repeated.\n",
+        [](common_params & params, const std::string & value) {
+            params.video.emplace_back(value);
+        }
+    ).set_examples({LLAMA_EXAMPLE_MTMD}));
     add_opt(common_arg(
         {"--image-min-tokens"}, "N",
         "minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)",
diff --git a/common/common.h b/common/common.h
index 78c568a7bc62e..37bc2e80e5b8f 100644
--- a/common/common.h
+++ b/common/common.h
@@ -406,6 +406,7 @@ struct common_params {
     bool mmproj_use_gpu = true;     // use GPU for multimodal model
     bool no_mmproj = false;         // explicitly disable multimodal model
     std::vector<std::string> image; // path to image file(s)
+    std::vector<std::string> video; // path to video file(s) or frame directories
     int image_min_tokens = -1;
     int image_max_tokens = -1;
 
diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt
index 2381012a0d02f..e135d4727f0a4 100644
--- a/tools/mtmd/CMakeLists.txt
+++ b/tools/mtmd/CMakeLists.txt
@@ -5,12 +5,14 @@ find_package(Threads REQUIRED)
 add_library(mtmd
             mtmd.cpp
             mtmd-audio.cpp
+            mtmd-video.cpp
             mtmd.h
             clip.cpp
             clip.h
             clip-impl.h
             mtmd-helper.cpp
             mtmd-helper.h
+            mtmd-video.h
             )
 
 target_link_libraries     (mtmd PUBLIC ggml llama)
@@ -20,6 +22,23 @@ target_include_directories(mtmd PRIVATE ../..)
 target_include_directories(mtmd PRIVATE ../../vendor)
 target_compile_features   (mtmd PRIVATE cxx_std_17)
 
+# Optional FFmpeg support for video decoding
+option(MTMD_WITH_FFMPEG "Enable FFmpeg-based video decoding in mtmd-video" OFF)
+if (MTMD_WITH_FFMPEG)
+    find_package(PkgConfig QUIET)
+    if (PKG_CONFIG_FOUND)
+        pkg_check_modules(FFMPEG QUIET IMPORTED_TARGET libavformat libavcodec libswscale libavutil)
+        if (FFMPEG_FOUND)
+            target_link_libraries(mtmd PRIVATE PkgConfig::FFMPEG)
+            target_compile_definitions(mtmd PRIVATE MTMD_WITH_FFMPEG)
+        else()
+            message(WARNING "FFmpeg not found via pkg-config; MTMD_WITH_FFMPEG disabled")
+        endif()
+    else()
+        message(WARNING "pkg-config not found; MTMD_WITH_FFMPEG disabled")
+    endif()
+endif()
+
 if (BUILD_SHARED_LIBS)
     set_target_properties     (mtmd PROPERTIES POSITION_INDEPENDENT_CODE ON)
     target_compile_definitions(mtmd PRIVATE LLAMA_BUILD)
@@ -29,6 +48,7 @@ endif()
 set(MTMD_PUBLIC_HEADERS
     ${CMAKE_CURRENT_SOURCE_DIR}/mtmd.h
     ${CMAKE_CURRENT_SOURCE_DIR}/mtmd-helper.h
+    ${CMAKE_CURRENT_SOURCE_DIR}/mtmd-video.h
     )
 
 set_target_properties(mtmd
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 99775cb3e351c..ac7247dd08bdb 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -201,6 +201,7 @@ struct clip_hparams {
     // legacy
     bool has_llava_projector = false;
     int minicpmv_version = 0;
+    int minicpmv_max_slice_nums = 9;
     int32_t minicpmv_query_num = 0;         // MiniCPM-V query number
 
     // custom value provided by user, can be undefined if not set
@@ -3911,16 +3912,67 @@ struct llava_uhd {
         const bool has_slices    = original_size.width > slice_size || original_size.height > slice_size;
         const bool has_pinpoints = !ctx->model.hparams.image_res_candidates.empty();
 
-        if (!has_slices) {
-            // skip slicing logic
-            res.overview_size = clip_image_size{slice_size, slice_size};
-            res.refined_size  = clip_image_size{0, 0};
-            res.grid_size     = clip_image_size{0, 0};
+        if (clip_is_minicpmv(ctx)) {
+            auto best_size    = get_best_resize(original_size, slice_size, patch_size, !has_slices);
+            res.overview_size = best_size;
+
+            {
+                const int max_slice_nums = ctx->model.hparams.minicpmv_max_slice_nums;
+                const float log_ratio = log((float)original_width / original_height);
+                const float ratio = (float)original_width * original_height / (slice_size * slice_size);
+                const int multiple = fmin(ceil(ratio), max_slice_nums);
+
+                auto best_grid   = get_best_grid(max_slice_nums, multiple, log_ratio);
+                auto refine_size = get_refine_size(original_size, best_grid, slice_size, patch_size, true);
+                res.grid_size    = best_grid;
+                res.refined_size = refine_size;
+
+                LOG_DBG("%s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d, grid size: %d x %d\n",
+                        __func__, original_width, original_height,
+                        res.overview_size.width, res.overview_size.height,
+                        res.refined_size.width, res.refined_size.height,
+                        res.grid_size.width, res.grid_size.height);
+
+                if (!has_slices || max_slice_nums == 0) {
+                    return res;
+                }
+
+                int width  = refine_size.width;
+                int height = refine_size.height;
+                int grid_x = int(width  / best_grid.width);
+                int grid_y = int(height / best_grid.height);
+                for (int patches_y = 0,                    ic = 0;
+                        patches_y < refine_size.height && ic < best_grid.height;
+                        patches_y += grid_y,              ic += 1) {
+                    for (int patches_x = 0,                   jc = 0;
+                            patches_x < refine_size.width && jc < best_grid.width;
+                            patches_x += grid_x,             jc += 1) {
+                        slice_coordinates slice;
+                        slice.x = patches_x;
+                        slice.y = patches_y;
+                        slice.size.width  = grid_x;
+                        slice.size.height = grid_y;
+                        res.slices.push_back(slice);
+                        LOG_DBG("%s: slice %d: x=%d, y=%d, size=%dx%d\n",
+                                __func__, (int)res.slices.size() - 1,
+                                slice.x, slice.y, slice.size.width, slice.size.height);
+                    }
+                }
+            }
 
             return res;
         }
+        else {
+            if (!has_slices) {
+                // skip slicing logic
+                res.overview_size = clip_image_size{slice_size, slice_size};
+                res.refined_size  = clip_image_size{0, 0};
+                res.grid_size     = clip_image_size{0, 0};
+
+                return res;
+            }
 
-        if (has_pinpoints) {
+            if (has_pinpoints) {
             // has pinpoints, use them to calculate the grid size (e.g. llava-1.6)
             auto refine_size = llava_uhd::select_best_resolution(
                 original_size,
@@ -3956,53 +4008,7 @@ struct llava_uhd {
 
             return res;
         }
-
-        // no pinpoints, dynamically calculate the grid size (e.g. minicpmv)
-
-        auto best_size    = get_best_resize(original_size, slice_size, patch_size, !has_slices);
-        res.overview_size = best_size;
-
-        {
-            const int max_slice_nums = 9; // TODO: this is only used by minicpmv, maybe remove it
-            const float log_ratio = log((float)original_width / original_height);
-            const float ratio = (float)original_width * original_height / (slice_size * slice_size);
-            const int multiple = fmin(ceil(ratio), max_slice_nums);
-
-            auto best_grid   = get_best_grid(max_slice_nums, multiple, log_ratio);
-            auto refine_size = get_refine_size(original_size, best_grid, slice_size, patch_size, true);
-            res.grid_size    = best_grid;
-            res.refined_size = refine_size;
-
-            LOG_DBG("%s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d, grid size: %d x %d\n",
-                    __func__, original_width, original_height,
-                    res.overview_size.width, res.overview_size.height,
-                    res.refined_size.width, res.refined_size.height,
-                    res.grid_size.width, res.grid_size.height);
-
-            int width  = refine_size.width;
-            int height = refine_size.height;
-            int grid_x = int(width  / best_grid.width);
-            int grid_y = int(height / best_grid.height);
-            for (int patches_y = 0,                    ic = 0;
-                    patches_y < refine_size.height && ic < best_grid.height;
-                    patches_y += grid_y,              ic += 1) {
-                for (int patches_x = 0,                   jc = 0;
-                        patches_x < refine_size.width && jc < best_grid.width;
-                        patches_x += grid_x,             jc += 1) {
-                    slice_coordinates slice;
-                    slice.x = patches_x;
-                    slice.y = patches_y;
-                    slice.size.width  = grid_x;
-                    slice.size.height = grid_y;
-                    res.slices.push_back(slice);
-                    LOG_DBG("%s: slice %d: x=%d, y=%d, size=%dx%d\n",
-                            __func__, (int)res.slices.size() - 1,
-                            slice.x, slice.y, slice.size.width, slice.size.height);
-                }
-            }
         }
-
-        return res;
     }
 
     static std::vector<clip_image_u8_ptr> slice_image(const clip_image_u8 * img, const slice_instructions & inst) {
diff --git a/tools/mtmd/mtmd-cli.cpp b/tools/mtmd/mtmd-cli.cpp
index 3e19e95958a2f..4b31061ef0ab8 100644
--- a/tools/mtmd/mtmd-cli.cpp
+++ b/tools/mtmd/mtmd-cli.cpp
@@ -6,12 +6,15 @@
 #include "ggml.h"
 #include "console.h"
 #include "chat.h"
+#include "clip.h"
 #include "mtmd.h"
 #include "mtmd-helper.h"
+#include "mtmd-video.h"
 
 #include <vector>
 #include <limits.h>
 #include <cinttypes>
+#include <cstdlib>
 
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
 #include <signal.h>
@@ -157,8 +160,8 @@ struct mtmd_cli_context {
         );
     }
 
-    bool load_media(const std::string & fname) {
-        mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(ctx_vision.get(), fname.c_str()));
+    bool load_media(const std::string & path) {
+        mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(ctx_vision.get(), path.c_str()));
         if (!bmp.ptr) {
             return false;
         }
@@ -287,7 +290,7 @@ int main(int argc, char ** argv) {
     mtmd_cli_context ctx(params);
     LOG("%s: loading model: %s\n", __func__, params.model.path.c_str());
 
-    bool is_single_turn = !params.prompt.empty() && !params.image.empty();
+    bool is_single_turn = !params.prompt.empty() && (!params.image.empty() || !params.video.empty());
 
     int n_predict = params.n_predict < 0 ? INT_MAX : params.n_predict;
 
@@ -311,19 +314,34 @@ int main(int argc, char ** argv) {
 
     if (is_single_turn) {
         g_is_generating = true;
-        if (params.prompt.find(mtmd_default_marker()) == std::string::npos) {
-            for (size_t i = 0; i < params.image.size(); i++) {
-                params.prompt += mtmd_default_marker();
-            }
-        }
-        common_chat_msg msg;
-        msg.role = "user";
-        msg.content = params.prompt;
+
+        // 1) load all media first
+        size_t n_loaded_media = 0;
         for (const auto & image : params.image) {
             if (!ctx.load_media(image)) {
                 return 1; // error is already printed by libmtmd
             }
+            n_loaded_media += 1;
+        }
+        for (const auto & vpath : params.video) {
+            if (!ctx.load_media(vpath)) {
+                return 1; // error is already printed by libmtmd
+            }
+            n_loaded_media += 1;
         }
+
+        // 2) build prompt content with correct number of markers
+        std::string prompt_content = params.prompt;
+        if (prompt_content.find(mtmd_default_marker()) == std::string::npos) {
+            for (size_t i = 0; i < n_loaded_media; i++) {
+                prompt_content += mtmd_default_marker();
+            }
+        }
+
+        // 3) run
+        common_chat_msg msg;
+        msg.role = "user";
+        msg.content = prompt_content;
         if (eval_message(ctx, msg)) {
             return 1;
         }
@@ -339,6 +357,9 @@ int main(int argc, char ** argv) {
         if (mtmd_support_audio(ctx.ctx_vision.get())) {
             LOG("\n   /audio <path>    load an audio");
         }
+        if (mtmd_support_vision(ctx.ctx_vision.get())) {
+            LOG("\n   /video <path>     load a video");
+        }
         LOG("\n   /clear           clear the chat history");
         LOG("\n   /quit or /exit   exit the program");
         LOG("\n");
@@ -370,14 +391,15 @@ int main(int argc, char ** argv) {
             g_is_generating = true;
             bool is_image = line == "/image" || line.find("/image ") == 0;
             bool is_audio = line == "/audio" || line.find("/audio ") == 0;
-            if (is_image || is_audio) {
+            bool is_video = line == "/video" || line.find("/video ") == 0;
+            if (is_image || is_audio || is_video) {
                 if (line.size() < 8) {
                     LOG_ERR("ERR: Missing media filename\n");
                     continue;
                 }
                 std::string media_path = line.substr(7);
                 if (ctx.load_media(media_path)) {
-                    LOG("%s %s loaded\n", media_path.c_str(), is_image ? "image" : "audio");
+                    LOG("%s %s loaded\n", media_path.c_str(), is_image ? "image" : (is_audio ? "audio" : "video"));
                     content += mtmd_default_marker();
                 }
                 // else, error is already printed by libmtmd
diff --git a/tools/mtmd/mtmd-helper.cpp b/tools/mtmd/mtmd-helper.cpp
index 686f42f3960fe..bdb9cc82faf2f 100644
--- a/tools/mtmd/mtmd-helper.cpp
+++ b/tools/mtmd/mtmd-helper.cpp
@@ -5,12 +5,18 @@
 #   define NOMINMAX
 #endif
 #include <windows.h>
+#else
+#include <dirent.h>
+#include <sys/stat.h>
+#include <sys/types.h>
 #endif
 
 #include "mtmd.h"
 #include "mtmd-helper.h"
 #include "llama.h"
 
+#include "mtmd-video.h"
+
 #include <algorithm>
 #include <cinttypes>
 #include <vector>
@@ -421,6 +427,10 @@ mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigne
         return mtmd_bitmap_init_from_audio(pcmf32.size(), pcmf32.data());
     }
 
+    if(mtmd_video::is_video_buffer(buf, len)) {
+        return mtmd_video::init_video_bitmap(ctx, buf, len);
+    }
+
     // otherwise, we assume it's an image
     mtmd_bitmap * result = nullptr;
     {
@@ -436,25 +446,34 @@ mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigne
     return result;
 }
 
-mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname) {
-    std::vector<unsigned char> buf;
-    FILE * f = fopen(fname, "rb");
+// Builds a bitmap from the media at `path`; returns nullptr when it cannot be loaded.
+mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * path) {
+    // A directory of frame images is also treated as a video, so a directory path is
+    // handed to mtmd_video::init_video_bitmap() as well. Video files are opened from the
+    // path by the video loader rather than being read into memory here.
+    if(mtmd_video::is_video_file(path)){
+        return mtmd_video::init_video_bitmap(ctx, path);
+    }
+
+    FILE * f = fopen(path, "rb");
     if (!f) {
-        LOG_ERR("Unable to open file %s: %s\n", fname, strerror(errno));
+        LOG_ERR("Unable to open path %s: %s\n", path, strerror(errno));
         return nullptr;
     }
 
     fseek(f, 0, SEEK_END);
     long file_size = ftell(f);
     fseek(f, 0, SEEK_SET);
-    buf.resize(file_size);
+    // The vector owns the bytes, so no return path can leak them. A failed ftell() (-1)
+    // gives an empty buffer and is reported by the size check after the read.
+    std::vector<unsigned char> buf(file_size > 0 ? (size_t) file_size : 0);
 
-    size_t n_read = fread(buf.data(), 1, file_size, f);
+    size_t n_read = fread(buf.data(), 1, buf.size(), f);
     fclose(f);
     if (n_read != (size_t)file_size) {
-        LOG_ERR("Failed to read entire file %s", fname);
+        LOG_ERR("Failed to read entire path %s", path);
         return nullptr;
     }
 
     return mtmd_helper_bitmap_init_from_buf(ctx, buf.data(), buf.size());
 }
diff --git a/tools/mtmd/mtmd-helper.h b/tools/mtmd/mtmd-helper.h
index 5c0edc6937eee..e9085e036da82 100644
--- a/tools/mtmd/mtmd-helper.h
+++ b/tools/mtmd/mtmd-helper.h
@@ -21,10 +21,11 @@ extern "C" {
 //
 
 // helper function to construct a mtmd_bitmap from a file
-// it calls mtmd_helper_bitmap_init_from_buf() internally
+// for image and audio, it calls mtmd_helper_bitmap_init_from_buf() internally
+// for video, it calls init_video_bitmap(), which reads and decodes the video and streams the individual image frames to a bitmap
 // returns nullptr on failure
 // this function is thread-safe
-MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname);
+MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * path);
 
 // helper function to construct a mtmd_bitmap from a buffer containing a file
 // supported formats:
diff --git a/tools/mtmd/mtmd-video.cpp b/tools/mtmd/mtmd-video.cpp
new file mode 100644
index 0000000000000..503d9058487c7
--- /dev/null
+++ b/tools/mtmd/mtmd-video.cpp
@@ -0,0 +1,652 @@
+#include <cstdint>
+#if defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+#   define NOMINMAX
+#endif
+#include <windows.h>
+#else
+#include <dirent.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#endif
+
+#include "mtmd-video.h"
+#include "ggml.h"
+#include "mtmd-helper.h"
+
+#include <algorithm>
+#include <string>
+#include <vector>
+#include <cstdio>
+#include <cerrno>
+#include <cstring>
+#include <memory>
+#include <cmath>
+#include <fstream>
+#include <queue>
+
+#define LOG_INF(...) fprintf(stdout, __VA_ARGS__)
+#define LOG_WRN(...) fprintf(stdout, __VA_ARGS__)
+#define LOG_ERR(...) fprintf(stderr, __VA_ARGS__)
+
+// Decides whether `path` looks like an image file usable as a video frame.
+// Two conditions must hold: the name ends (case-insensitively) with a known image
+// extension, and the file starts with a JPEG, PNG, BMP, GIF or WEBP signature.
+// Returns false when the file cannot be opened or holds fewer than 4 bytes.
+static bool is_image_file(const std::string & path) {
+    // check extension first
+    std::string lower = path;
+    std::transform(lower.begin(), lower.end(), lower.begin(),
+                   [](unsigned char c){ return (char)std::tolower(c); });
+
+    // The extension has to be a real suffix: a substring search would also accept
+    // names such as "frame.png.txt" or any file below a directory called "shots.jpg".
+    const auto ends_with = [&lower](const char * ext) {
+        const size_t len = strlen(ext);
+        return lower.size() >= len && lower.compare(lower.size() - len, len, ext) == 0;
+    };
+    bool has_ext = ends_with(".jpg") || ends_with(".jpeg") || ends_with(".png") ||
+                   ends_with(".bmp") || ends_with(".gif")  || ends_with(".webp");
+
+    if (!has_ext) return false;
+
+    // check magic bytes then
+    std::ifstream f(path, std::ios::binary);
+    if (!f.is_open()) return false;
+
+    // 16 bytes cover every signature tested below (the longest, WEBP, needs 12)
+    std::vector<unsigned char> header(16);
+    f.read(reinterpret_cast<char*>(header.data()), header.size());
+    size_t n = f.gcount();
+    if (n < 4) return false; // too small to be an image
+
+    // JPEG: FF D8 FF
+    if (n >= 3 && header[0] == 0xFF && header[1] == 0xD8 && header[2] == 0xFF)
+        return true;
+
+    // PNG: 89 50 4E 47 0D 0A 1A 0A
+    static const unsigned char png_magic[8] = {0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A};
+    if (n >= 8 && memcmp(header.data(), png_magic, sizeof(png_magic)) == 0)
+        return true;
+
+    // BMP: 42 4D
+    if (n >= 2 && header[0] == 0x42 && header[1] == 0x4D)
+        return true;
+
+    // GIF: "GIF87a" or "GIF89a"
+    if (n >= 6 && (memcmp(header.data(), "GIF87a", 6) == 0 ||
+                   memcmp(header.data(), "GIF89a", 6) == 0))
+        return true;
+
+    // WEBP: "RIFF....WEBP"
+    if (n >= 12 && memcmp(header.data(), "RIFF", 4) == 0 &&
+                   memcmp(header.data() + 8, "WEBP", 4) == 0)
+        return true;
+
+    return false;
+}
+
+// True when `path` exists and is a directory; any lookup failure yields false.
+static bool is_dir(const std::string & path) {
+#if defined(_WIN32)
+    const DWORD file_attrs = GetFileAttributesA(path.c_str());
+    return file_attrs != INVALID_FILE_ATTRIBUTES && (file_attrs & FILE_ATTRIBUTE_DIRECTORY) != 0;
+#else
+    struct stat info;
+    return stat(path.c_str(), &info) == 0 && S_ISDIR(info.st_mode);
+#endif
+}
+
+// Appends the path of every non-directory entry of `dir` to `out`; sub-directories are
+// walked too when `recursive` is set. An unreadable directory adds nothing; no sorting.
+static void list_files(const std::string & dir, std::vector<std::string> & out, bool recursive) {
+#if defined(_WIN32)
+    // "<dir>\" prefix; the separator is added only when `dir` does not already end in one
+    std::string prefix = dir;
+    if (!prefix.empty() && prefix.back() != '/' && prefix.back() != '\\') prefix += "\\";
+    const std::string glob = prefix + "*";
+    WIN32_FIND_DATAA entry;
+    HANDLE handle = FindFirstFileA(glob.c_str(), &entry);
+    if (handle == INVALID_HANDLE_VALUE) return;
+    do {
+        const std::string name = entry.cFileName;
+        if (name == "." || name == "..") continue;
+        const std::string full = prefix + name;
+        if ((entry.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) == 0) {
+            out.push_back(full);
+        } else if (recursive) {
+            list_files(full, out, recursive);
+        }
+    } while (FindNextFileA(handle, &entry) != 0);
+    FindClose(handle);
+#else
+    DIR * handle = opendir(dir.c_str());
+    if (!handle) return;
+    for (struct dirent * entry = readdir(handle); entry != nullptr; entry = readdir(handle)) {
+        const std::string name = entry->d_name;
+        if (name == "." || name == "..") continue;
+        const std::string full = dir + "/" + name;
+        if (!is_dir(full)) {
+            out.push_back(full);
+        } else if (recursive) {
+            list_files(full, out, recursive);
+        }
+    }
+    closedir(handle);
+#endif
+}
+
+namespace mtmd_video {
+
+// True for a directory (treated as a folder of frames) or a path ENDING in a video extension.
+bool is_video_file(const std::string & path){
+    if(is_dir(path)) return true; // a directory containing frames images is also considered a video
+    auto lower = path;
+    std::transform(lower.begin(), lower.end(), lower.begin(), [](unsigned char c){ return (char)std::tolower(c); });
+    // suffix match only: a substring match would also accept "clip.mp4.png" or "shots.mov/a.png"
+    for (const std::string ext : {".mp4", ".mov", ".mkv", ".avi", ".webm"})
+        if (lower.size() >= ext.size() && lower.compare(lower.size() - ext.size(), ext.size(), ext) == 0) return true;
+    return false;
+}
+
+static void get_video_info_from_dir(const std::string &path, VideoInfo &info){
+    std::vector<std::string> entries;
+    list_files(path, entries, true); // recursive walk; every file counts, image or not
+    info.total_frames = entries.size();
+    info.fps = 1; // placeholder: no frame rate is derived for a directory
+}
+// Loads a directory of frame images as one video bitmap (NOTE: untested).
+// Image files are sorted by name, every `stride`-th one is decoded, up to `max_frames`.
+static mtmd_bitmap* load_frames_from_dir(mtmd_context * ctx,
+                          const std::string & dir_path,
+                          const LoadVideoOptions & opts) {
+    if (!ctx || dir_path.empty() || !is_dir(dir_path) || opts.max_frames < 1) {
+        return nullptr;
+    }
+    std::vector<std::string> files;
+    list_files(dir_path, files, opts.recursive);
+    // drop non-image files up front so stride and frame count refer to real frames
+    files.erase(std::remove_if(files.begin(), files.end(),
+        [](const std::string & f) { return !is_image_file(f); }), files.end());
+    std::sort(files.begin(), files.end());
+
+    const size_t stride = std::max(1u, opts.stride);
+    // indices 0, stride, 2*stride, ... => ceil(size / stride) frames (floor could yield 0)
+    const size_t nframes = std::min((files.size() + stride - 1) / stride, (size_t)opts.max_frames);
+
+    size_t loaded = 0;
+    mtmd_bitmap* out_frames = nullptr;
+    uint32_t w=0, h=0;
+    for (size_t i = 0; i < files.size() && loaded < nframes; i += stride) {
+        mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(ctx, files[i].c_str()));
+        if (!bmp.ptr) continue;
+        if(loaded==0){
+            w = bmp.nx();
+            h = bmp.ny();
+            out_frames = mtmd_bitmap_init_from_video(w, h, nframes, nullptr);
+        }
+        GGML_ASSERT(bmp.nx() == w && bmp.ny() == h); // all frames must have the same size
+        mtmd_bitmap_set_frame(out_frames, loaded, bmp.data());
+        loaded++;
+    }
+    return out_frames;
+}
+
+// Queue of decoded video frames, designed for streaming scenarios: an FFmpeg-related
+// thread acts as the producer generating frames, and the main program (the LLM
+// inference thread) acts as the consumer processing them.
+// NOTE(review): the struct itself holds no lock; callers must synchronize access.
+struct DecodedFramesQueue {
+    int width  = 0; // width shared by every queued frame
+    int height = 0; // height shared by every queued frame
+    std::queue<std::vector<unsigned char>> frames; // frame size = width * height * 3
+};
+
+// Chooses frame-sampling options for the video described by `info`.
+// At most 64 frames are kept (4 when the MTMD_MAX_VIDEO_FRAMES_SMALL environment
+// variable is set, for quick local tests). Videos with more frames than that are
+// sampled uniformly; shorter ones at about one frame per second.
+static mtmd_video::LoadVideoOptions get_video_sample_options(mtmd_video::VideoInfo info){
+    /* MiniCPM-V normal-speed video frames sample method */
+    // set a small number of frames for fast test locally
+    const bool use_small_budget = std::getenv("MTMD_MAX_VIDEO_FRAMES_SMALL") != nullptr;
+    const int32_t frame_budget  = use_small_budget ? 4 : 64;
+
+    mtmd_video::LoadVideoOptions opts;
+    // recursion into sub-directories stays disabled
+    opts.recursive  = false;
+    opts.max_frames = frame_budget;
+    if (info.total_frames <= frame_budget) {
+        // 1 frame per second
+        opts.stride = (info.fps > 1.0) ? (int)std::ceil(info.fps) : 1;
+    } else {
+        // uniform sample
+        opts.stride = (int)std::ceil((double)info.total_frames / frame_budget);
+    }
+
+    return opts;
+}
+
+// --- FFmpeg-based file decoding (optional) ---
+
+#ifdef MTMD_WITH_FFMPEG
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+extern "C" {
+#include <libavformat/avformat.h>
+#include <libavcodec/avcodec.h>
+#include <libswscale/swscale.h>
+#include <libavutil/imgutils.h>
+}
+#pragma GCC diagnostic pop
+
+// Probes an in-memory buffer with FFmpeg and reports whether it holds a video container;
+// still-image formats that FFmpeg also recognizes are rejected.
+bool is_video_buffer(const uint8_t *data, size_t size){
+    if (!data || size < 16) return false; // too short
+
+    // Only the head of the data is probed: this bounds the copy below and keeps buf_size
+    // within int range (1 MiB is FFmpeg's default format_probesize).
+    const size_t probe_size = std::min(size, (size_t)1 << 20);
+    // ffmpeg requires AVPROBE_PADDING_SIZE zero bytes after the data; the vector starts zeroed
+    std::vector<uint8_t> padded(probe_size + AVPROBE_PADDING_SIZE);
+    memcpy(padded.data(), data, probe_size);
+
+    AVProbeData probe = {}; // zero every field: the prober also reads mime_type
+    probe.filename = "";
+    probe.buf      = padded.data();
+    probe.buf_size = (int)probe_size;
+
+    int score = 0;
+    const AVInputFormat *fmt = av_probe_input_format2(&probe, 1, &score);
+    if (!fmt || score < 25) return false;
+    if (fmt->flags & AVFMT_NOFILE) return false;
+
+    static const char *image_formats[] = {
+        "jpeg_pipe", "png_pipe", "bmp_pipe", "gif_pipe", "webp_pipe",
+        "tiff_pipe", "image2", "image2pipe", "mjpeg"
+    };
+    for (const auto* name : image_formats)
+        if (fmt->name && strstr(fmt->name, name))
+            return false;
+    return true;
+}
+
+struct BufferData { // read cursor over a byte buffer that the struct does not own
+    const uint8_t* base; // first byte of the data
+    size_t size;         // total number of bytes
+    size_t pos = 0;      // current read offset, starts at the beginning
+    BufferData(const uint8_t* data, size_t len) : base(data), size(len) {}
+};
+
+// AVIO read callback: serves bytes from the BufferData in `opaque`, AVERROR_EOF once drained.
+static int read_packet(void* opaque, uint8_t* buf, int buf_size) {
+    auto * src = static_cast<BufferData*>(opaque);
+    if (src == nullptr || src->base == nullptr) return AVERROR(EIO);
+    if (src->pos >= src->size) return AVERROR_EOF;
+    const int count = (int)std::min(src->size - src->pos, (size_t)buf_size);
+    if (count == 0) return AVERROR_EOF;
+    memcpy(buf, src->base + src->pos, count);
+    src->pos += count;
+    return count;
+}
+
+// AVIO seek callback for the in-memory BufferData in `opaque`. Returns the new offset,
+// the buffer size for AVSEEK_SIZE, or -1 when the target lies outside [0, size].
+static int64_t seek_packet(void* opaque, int64_t offset, int whence) {
+    BufferData* bd = static_cast<BufferData*>(opaque);
+    if (!bd) return -1;
+    whence &= ~AVSEEK_FORCE; // FFmpeg may OR this hint into whence; it names no origin
+    if (whence == AVSEEK_SIZE) return (int64_t)bd->size;
+
+    // Resolve the origin first, then range-check with signed arithmetic so that a
+    // negative offset is never negated or cast to an unsigned type.
+    int64_t origin = 0;
+    if      (whence == SEEK_SET) origin = 0;
+    else if (whence == SEEK_CUR) origin = (int64_t)bd->pos;
+    else if (whence == SEEK_END) origin = (int64_t)bd->size;
+    else return -1;
+    if (offset < -origin || offset > (int64_t)bd->size - origin) return -1;
+    bd->pos = (size_t)(origin + offset);
+    return (int64_t)bd->pos;
+}
+
+// Opens an FFmpeg demuxer over an in-memory buffer.
+//
+// On success returns true with `fmt` pointing at the opened AVFormatContext and `avio_ctx`
+// at the custom AVIOContext that feeds it; the caller releases both with
+// free_format_context_from_buffer(). On failure returns false, releases everything that
+// was allocated here, and leaves both output pointers null.
+//
+// `buffer` is not copied: it must stay valid for as long as `fmt` is in use.
+static bool create_format_context_from_buffer(const uint8_t* buffer, size_t size,
+                                       AVFormatContext*& fmt,
+                                       AVIOContext*& avio_ctx) {
+    fmt = nullptr;
+    avio_ctx = nullptr;
+
+    if (!buffer || size == 0) return false;
+
+    // read cursor handed to the AVIO callbacks through `opaque`
+    BufferData* bd = new (std::nothrow) BufferData(buffer, size);
+    if (!bd) return false;
+
+    const int AVIO_BUF_SIZE = 4096;
+    uint8_t* avio_ctx_buffer = static_cast<uint8_t*>(av_malloc(AVIO_BUF_SIZE));
+    if (!avio_ctx_buffer) {
+        delete bd;
+        return false;
+    }
+
+    avio_ctx = avio_alloc_context(
+        avio_ctx_buffer, AVIO_BUF_SIZE,
+        0, // read only
+        bd,
+        &read_packet,
+        nullptr,
+        &seek_packet
+    );
+
+    if (!avio_ctx) {
+        av_free(avio_ctx_buffer);
+        delete bd;
+        return false;
+    }
+
+    // Releases the AVIOContext together with what hangs off it. avio_context_free() only
+    // frees the context struct, so the I/O buffer (which libavformat may have replaced
+    // with its own allocation) and the opaque BufferData are released explicitly.
+    const auto release_avio = [&avio_ctx]() {
+        if (!avio_ctx) return;
+        delete static_cast<BufferData*>(avio_ctx->opaque);
+        av_freep(&avio_ctx->buffer);
+        avio_context_free(&avio_ctx); // also resets avio_ctx to nullptr
+    };
+
+    fmt = avformat_alloc_context();
+    if (!fmt) {
+        release_avio();
+        return false;
+    }
+
+    fmt->pb = avio_ctx;
+    fmt->flags |= AVFMT_FLAG_CUSTOM_IO;
+
+    // increase probing - optional but helpful for truncated/streamed files
+    AVDictionary* opts = nullptr;
+    av_dict_set(&opts, "probesize", "5000000", 0);
+    av_dict_set(&opts, "analyzeduration", "5000000", 0);
+
+    int ret = avformat_open_input(&fmt, "stream", nullptr, &opts);
+    av_dict_free(&opts);
+
+    if (ret < 0) {
+        // avformat_open_input() frees a user-supplied context on failure and nulls `fmt`;
+        // the guard only matters if that contract is ever not honored. With
+        // AVFMT_FLAG_CUSTOM_IO the AVIOContext is never touched by libavformat's cleanup.
+        if (fmt) {
+            avformat_free_context(fmt);
+            fmt = nullptr;
+        }
+        release_avio();
+        return false;
+    }
+
+    // success: the caller now owns fmt and avio_ctx (including its buffer and opaque)
+    return true;
+}
+
+static void free_format_context_from_buffer(AVFormatContext* fmt,
+                                     AVIOContext* avio_ctx) {
+    // Tears down what create_format_context_from_buffer() built.
+    //
+    // fmt carries AVFMT_FLAG_CUSTOM_IO, so avformat_close_input() leaves
+    // fmt->pb untouched, and avio_context_free() releases neither the I/O
+    // buffer nor the opaque pointer. All three are therefore freed here.
+    //
+    // fmt      - demuxer context, may be null after a partial failure
+    // avio_ctx - custom I/O context, only consulted when fmt is null
+    AVIOContext* pb = nullptr;
+    if (fmt) {
+        // grab pb BEFORE closing: fmt is gone once avformat_close_input returns
+        pb = fmt->pb;
+        avformat_close_input(&fmt);
+    } else {
+        // partial failure case: fmt is null but avio_ctx may still be valid
+        pb = avio_ctx;
+    }
+    if (!pb) {
+        return;
+    }
+
+    delete static_cast<BufferData*>(pb->opaque);
+    // libavformat may have swapped the buffer it was handed for another one,
+    // so free whatever pb currently points at rather than the original pointer
+    av_freep(&pb->buffer);
+    avio_context_free(&pb);
+}
+
+
+static bool get_video_info_from_format_ctx(AVFormatContext *fmt, VideoInfo &info) {
+    // Probes an opened container and fills info.fps / info.total_frames.
+    // Returns false when fmt is null, probing fails or there is no video stream;
+    // fields that cannot be determined are left as the caller passed them in.
+    if (fmt == nullptr || avformat_find_stream_info(fmt, nullptr) < 0) {
+        return false;
+    }
+
+    const int video_idx = av_find_best_stream(fmt, AVMEDIA_TYPE_VIDEO, -1, -1, nullptr, 0);
+    if (video_idx < 0) {
+        return false;
+    }
+    const AVStream *stream = fmt->streams[video_idx];
+
+    // frame rate: prefer avg_frame_rate, fall back to r_frame_rate
+    const auto avg_rate  = stream->avg_frame_rate;
+    const auto base_rate = stream->r_frame_rate;
+    if (avg_rate.num > 0 && avg_rate.den > 0) {
+        info.fps = av_q2d(avg_rate);
+    } else if (base_rate.num > 0 && base_rate.den > 0) {
+        info.fps = av_q2d(base_rate);
+    }
+    // frame count: use the stream's own count, otherwise estimate duration * fps
+    if (stream->nb_frames > 0) {
+        info.total_frames = stream->nb_frames;
+    } else if (fmt->duration > 0 && info.fps > 0.0) {
+        const double seconds = fmt->duration / (double)AV_TIME_BASE;
+        info.total_frames = std::llround(seconds * info.fps);
+    }
+    return true;
+}
+
+static bool decode_video_ffmpeg_to_rgba_from_format_ctx(
+    AVFormatContext* fmt,
+    DecodedFramesQueue& out_q,
+    mtmd_video::LoadVideoOptions opts)
+{
+    // Decodes the best video stream of fmt and appends the sampled frames to
+    // out_q as tightly packed RGB24 (3 bytes per pixel, despite the name).
+    // One frame out of every opts.stride is kept, at most opts.max_frames in
+    // total. Returns true if at least one frame was appended.
+    const auto stride = opts.stride;
+    const auto max_frames = opts.max_frames;
+    if(!fmt || stride <= 0 || max_frames <= 0) return false;
+    // vstream checked before in get_video_info_from_format_ctx
+    int vstream = av_find_best_stream(fmt, AVMEDIA_TYPE_VIDEO, -1, -1, nullptr, 0);
+    if (vstream < 0) return false;
+    AVStream * st = fmt->streams[vstream];
+    const AVCodec * dec = avcodec_find_decoder(st->codecpar->codec_id);
+    if (!dec){
+        LOG_ERR("%s: Could not find decoder for codec id %d\n", __func__, st->codecpar->codec_id);
+        return false;
+    }
+    AVCodecContext * ctx = avcodec_alloc_context3(dec);
+    if (!ctx){
+        LOG_ERR("%s: Could not allocate codec context\n", __func__);
+        return false;
+    }
+    std::unique_ptr<AVCodecContext, void(*)(AVCodecContext*)> ctx_guard(ctx, [](AVCodecContext *c){ if (c) avcodec_free_context(&c); });
+    if (avcodec_parameters_to_context(ctx, st->codecpar) < 0){
+        LOG_ERR("%s: Could not copy codec parameters to codec context\n", __func__);
+        return false;
+    }
+    if (avcodec_open2(ctx, dec, nullptr) < 0){
+        LOG_ERR("%s: Could not open codec2\n", __func__);
+        return false;
+    }
+
+    AVFrame * frame = av_frame_alloc();
+    AVPacket * pkt  = av_packet_alloc();
+    std::unique_ptr<AVFrame, void(*)(AVFrame*)> frame_guard(frame, [](AVFrame *f){ if (f) av_frame_free(&f); });
+    std::unique_ptr<AVPacket, void(*)(AVPacket*)> pkt_guard(pkt, [](AVPacket *p){ if (p) av_packet_free(&p); });
+    if (!frame || !pkt) {
+        LOG_ERR("%s: Could not allocate frame or packet\n", __func__);
+        return false;
+    }
+    std::unique_ptr<SwsContext, void(*)(SwsContext*)> sws(nullptr, [](SwsContext *s){ if (s) sws_freeContext(s); });
+    uint32_t idx = 0;
+    uint32_t taken = 0;
+    const int w = ctx->width;
+    const int h = ctx->height;
+    if(!out_q.frames.empty() && (out_q.height!=h || out_q.width!=w)){
+        LOG_ERR("Different size frames remain in decoded frames queue\n");
+        return false;
+    }
+    out_q.width = w;
+    out_q.height = h;
+
+    // the limit is tested in both loops: leaving only the inner one lets later packets overshoot max_frames
+    while (taken < max_frames && av_read_frame(fmt, pkt) >= 0) {
+        if (pkt->stream_index != vstream) { av_packet_unref(pkt); continue; }
+        const int sent = avcodec_send_packet(ctx, pkt);
+        av_packet_unref(pkt);
+        if (sent < 0) break;
+        while (taken < max_frames && avcodec_receive_frame(ctx, frame) == 0) {
+            if (idx++ % stride != stride/2 || frame->format == AV_PIX_FMT_NONE) { av_frame_unref(frame); continue; }
+            if (frame->width != w || frame->height != h) {
+                LOG_ERR("%s: Decoded frame size does not match the stream size\n", __func__);
+                return false;
+            }
+            if (!sws) {
+                sws.reset(sws_getContext(w, h, (AVPixelFormat)frame->format, w, h, AV_PIX_FMT_RGB24, SWS_BILINEAR, nullptr, nullptr, nullptr));
+                if (!sws){
+                    LOG_ERR("%s: Could not initialize sws context\n", __func__);
+                    return false;
+                }
+            }
+            std::vector<uint8_t> out((size_t)w * h * 3);
+            uint8_t *dst_data[4] = { out.data(), nullptr, nullptr, nullptr };
+            int dst_linesize[4] = { w * 3, 0, 0, 0 };
+            sws_scale(sws.get(), frame->data, frame->linesize, 0, h, dst_data, dst_linesize);
+            out_q.frames.emplace(std::move(out));
+            taken++;
+            av_frame_unref(frame);
+        }
+    }
+    return taken > 0;
+}
+
+static mtmd_bitmap* convert_frames_to_bitmap(mtmd_context * ctx, DecodedFramesQueue& decoded) {
+    // Drains the decoded-frame queue into one freshly allocated video bitmap.
+    // Yields nullptr when there is no context or the queue holds no frames;
+    // on success the queue is left empty and the bitmap is returned.
+    if (ctx == nullptr || decoded.frames.empty()) {
+        return nullptr;
+    }
+    const auto frame_count = decoded.frames.size();
+    mtmd_bitmap* video = mtmd_bitmap_init_from_video(
+        uint32_t(decoded.width), uint32_t(decoded.height), uint32_t(frame_count), nullptr);
+    // frames are taken from the front of the queue, one bitmap slot per frame
+    for (size_t slot = 0; !decoded.frames.empty(); ++slot) {
+        mtmd_bitmap_set_frame(video, slot, decoded.frames.front().data());
+        decoded.frames.pop();
+    }
+    return video;
+}
+
+mtmd_bitmap* init_video_bitmap(mtmd_context * ctx, const uint8_t* buffer, size_t size){
+    // Decodes an in-memory video file into a video bitmap. Returns nullptr
+    // (instead of aborting) when the buffer cannot be opened, probed or decoded.
+    auto info = mtmd_video::VideoInfo{};
+    AVFormatContext* fmt = nullptr;
+    AVIOContext* avio_ctx = nullptr;
+    if(!create_format_context_from_buffer(buffer, size, fmt, avio_ctx)) {
+        LOG_ERR("Unable to open video from buffer\n");
+        return nullptr;
+    }
+    if(!get_video_info_from_format_ctx(fmt, info)) {
+        LOG_ERR("Unable to get video info from buffer\n");
+        free_format_context_from_buffer(fmt, avio_ctx);
+        return nullptr;
+    }
+    DecodedFramesQueue frames;
+    if(!decode_video_ffmpeg_to_rgba_from_format_ctx(fmt, frames, get_video_sample_options(info))){
+        LOG_ERR("Unable to decode video from buffer\n");
+        free_format_context_from_buffer(fmt, avio_ctx);
+        return nullptr;
+    }
+    auto * res = convert_frames_to_bitmap(ctx, frames);
+    free_format_context_from_buffer(fmt, avio_ctx);
+    return res;
+}
+#else
+bool is_video_buffer(const uint8_t */*data*/, size_t /*size*/){ // stub for builds without FFmpeg: ignores its arguments
+    LOG_WRN("FFmpeg support is not enabled in this build, can not check it\n");
+    return false; // nothing can be probed here, so the answer is always "not a video"
+}
+mtmd_bitmap* init_video_bitmap(mtmd_context * /*ctx*/, const uint8_t* /*buffer*/, size_t /*size*/){ // stub for builds without FFmpeg
+    LOG_ERR("FFmpeg support is not enabled in this build, can not load video from buffer\n");
+    return nullptr; // in-memory videos cannot be decoded in this configuration
+}
+#endif
+
+mtmd_bitmap* init_video_bitmap(mtmd_context * ctx, const std::string & path) {
+    // Builds a video bitmap from `path`: a directory is read through the
+    // frame-directory helpers, anything else is handed to FFmpeg (when built in).
+    // For files, nullptr is returned when opening, probing or decoding fails,
+    // or when the build has no FFmpeg support.
+    mtmd_video::VideoInfo info{};
+
+    if(is_dir(path)){
+        get_video_info_from_dir(path, info);
+        const auto dir_opts = get_video_sample_options(info);
+        return load_frames_from_dir(ctx, path, dir_opts);
+    }
+
+    // regular file: needs the FFmpeg-backed decoder
+    #ifdef MTMD_WITH_FFMPEG
+    AVFormatContext* raw_fmt = nullptr;
+    LOG_INF("Loading video from path: %s\n", path.c_str());
+    if (avformat_open_input(&raw_fmt, path.c_str(), nullptr, nullptr) < 0){
+        LOG_ERR("Unable to open video from path: %s\n", path.c_str());
+        if(raw_fmt) avformat_close_input(&raw_fmt);
+        return nullptr;
+    }
+    // closes the demuxer on every return below
+    std::unique_ptr<AVFormatContext, void(*)(AVFormatContext*)> fmt(raw_fmt, [](AVFormatContext *f){ if (f) avformat_close_input(&f); });
+
+    LOG_INF("get_video_info_from_format_ctx...\n");
+    if(!get_video_info_from_format_ctx(fmt.get(), info)) {
+        LOG_ERR("Unable to get video info from path: %s\n", path.c_str());
+        return nullptr;
+    }
+
+    const auto file_opts = get_video_sample_options(info);
+    LOG_INF("decode_video_ffmpeg_to_rgba_from_format_ctx...\n");
+    DecodedFramesQueue frames;
+    if(!decode_video_ffmpeg_to_rgba_from_format_ctx(fmt.get(), frames, file_opts)){
+        LOG_ERR("Unable to decode video from path: %s\n", path.c_str());
+        return nullptr;
+    }
+
+    return convert_frames_to_bitmap(ctx, frames);
+    #else
+    LOG_ERR("FFmpeg support is not enabled in this build, can not load video from file\n");
+    return nullptr;
+    #endif
+}
+
+} // namespace mtmd_video
+
+
diff --git a/tools/mtmd/mtmd-video.h b/tools/mtmd/mtmd-video.h
new file mode 100644
index 0000000000000..23795bf350470
--- /dev/null
+++ b/tools/mtmd/mtmd-video.h
@@ -0,0 +1,47 @@
+// mtmd-video helpers (experimental)
+// Minimal C++ helpers that load video frames from a video file, an in-memory
+// buffer or a directory of frames into a single mtmd_bitmap for tokenization.
+
+#ifndef MTMD_VIDEO_H
+#define MTMD_VIDEO_H
+
+#include "mtmd.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <string>
+#include <vector>
+
+namespace mtmd_video {
+
+struct LoadVideoOptions {
+    uint32_t max_frames = 32;   // maximum frames to load; NOTE(review): 0 was documented as "no limit", but the FFmpeg decode path rejects 0 - confirm intent
+    uint32_t stride     = 1;    // take every N-th frame
+    bool recursive = false;// scan subdirectories
+};
+
+struct VideoInfo {
+    double fps = 0.0;           // frames per second (stays 0.0 if the FFmpeg probe finds no usable rate)
+    int64_t total_frames = 0;   // total number of frames (may be an estimate from duration * fps)
+};
+
+// Check if a path is a video file based on its extension
+bool is_video_file(const std::string & path);
+
+// Check if a buffer contains video file data via FFmpeg (always false when built without FFmpeg)
+// Notice: audio containers may also be recognized as valid media
+bool is_video_buffer(const uint8_t *data, size_t size);
+
+// Load a video from a file or from a directory of frames (auto-detect).
+// Returns a mtmd_bitmap containing all frames in RGB format, or nullptr if a video file cannot be loaded.
+mtmd_bitmap* init_video_bitmap(mtmd_context * ctx,
+                               const std::string & path);
+mtmd_bitmap* init_video_bitmap(mtmd_context * ctx, // overload for a video file held in memory
+                                 const uint8_t* buffer,
+                                 size_t size);
+
+} // namespace mtmd_video
+
+#endif // MTMD_VIDEO_H
+
+
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index 325f7ff995e36..e43a0007a5eda 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -16,19 +16,34 @@
 
 #include <algorithm>
 #include <cerrno>
+#include <cstdint>
 #include <cstdio>
 #include <cstdlib>
 #include <cstring>
 #include <vector>
 
-// represents raw image data, layout is RGBRGBRGB...
-// length of data must be nx * ny * 3
+enum mtmd_bitmap_type { // kind of media stored in a mtmd_bitmap
+    MTMD_BITMAP_TYPE_IMAGE, // RGB image, nz is 1
+    MTMD_BITMAP_TYPE_AUDIO, // PCM F32 samples, ny and nz are 1
+    MTMD_BITMAP_TYPE_VIDEO, // nz RGB frames stored sequentially
+};
+
+// if bitmap is image:
+//     length of data must be nx * ny * 3
+//     the data is in RGBRGBRGB... format
+// if bitmap is audio:
+//     length of data must be n_samples * sizeof(float)
+//     the data is in float format (PCM F32)
+// if bitmap is video:
+//     length of data must be nx * ny * nframes * 3
+//     the data is in RGBRGBRGB... format for each frame, frames are stored sequentially
 struct mtmd_bitmap {
+    mtmd_bitmap_type type;
     uint32_t nx;
-    uint32_t ny;
+    uint32_t ny; // for audio, ny=1
+    uint32_t nz; // for video: number of frames, always 1 for image and audio
     std::vector<unsigned char> data;
     std::string id; // optional user-defined id, for ex: can be set to image hash, useful for KV cache tracking
-    bool is_audio = false; // true if the bitmap is audio
 };
 
 struct mtmd_image_tokens {
@@ -87,6 +102,12 @@ enum mtmd_slice_tmpl {
     MTMD_SLICE_TMPL_IDEFICS3,
 };
 
+// video schema, selects how the frames of a video bitmap are tokenized
+enum mtmd_video_schema {
+    MTMD_VIDEO_SCHEMA_MINICPMV_2_6, // add overview image only (no slicing)
+    MTMD_VIDEO_SCHEMA_MINICPMV_4_5, // not implemented yet: add_media rejects it with an error
+};
+
 const char * mtmd_default_marker() {
     return "<__media__>";
 }
@@ -134,6 +155,7 @@ struct mtmd_context {
     // for llava-uhd style models, we need special tokens in-between slices
     // minicpmv calls them "slices", llama 4 calls them "tiles"
     mtmd_slice_tmpl slice_tmpl    = MTMD_SLICE_TMPL_NONE;
+    mtmd_video_schema video_sch   = MTMD_VIDEO_SCHEMA_MINICPMV_2_6;
     std::vector<llama_token> tok_ov_img_start;  // overview image
     std::vector<llama_token> tok_ov_img_end;    // overview image
     std::vector<llama_token> tok_slices_start;  // start of all slices
@@ -505,58 +527,61 @@ struct mtmd_tokenizer {
         }
     }
 
-    int32_t add_media(const mtmd_bitmap * bitmap) {
-        if (!bitmap->is_audio) {
-            // handle image
-
-            if (!ctx->ctx_v) {
-                LOG_ERR("%s: error: model does not support vision input\n", __func__);
-                return 2;
-            }
+    int32_t add_image(const mtmd_bitmap * bitmap, bool have_slices=true) {
+        if(mtmd_bitmap_is_audio(bitmap) || mtmd_bitmap_is_video(bitmap)){
+            LOG_ERR("%s: error: bitmap is not an image\n", __func__);
+            return 2;
+        }
+        // handle image
+        if (!ctx->ctx_v) {
+            LOG_ERR("%s: error: model does not support vision input\n", __func__);
+            return 2;
+        }
 
-            if (!ctx->img_beg.empty()) {
-                add_text(ctx->img_beg, true); // add image begin token
-            }
+        if (!ctx->img_beg.empty()) {
+            add_text(ctx->img_beg, true); // add image begin token
+        }
 
-            // convert mtmd_bitmap to clip_image_u8
-            clip_image_u8_ptr img_u8(clip_image_u8_init());
-            img_u8->nx = bitmap->nx;
-            img_u8->ny = bitmap->ny;
-            img_u8->buf.resize(bitmap->data.size());
-            std::memcpy(img_u8->buf.data(), bitmap->data.data(), img_u8->nx * img_u8->ny * 3);
+        // convert mtmd_bitmap to clip_image_u8
+        clip_image_u8_ptr img_u8(clip_image_u8_init());
+        img_u8->nx = bitmap->nx;
+        img_u8->ny = bitmap->ny;
+        img_u8->buf.resize(bitmap->data.size());
+        std::memcpy(img_u8->buf.data(), bitmap->data.data(), img_u8->nx * img_u8->ny * 3);
+
+        // preprocess image
+        clip_image_f32_batch batch_f32;
+        bool ok = clip_image_preprocess(ctx->ctx_v, img_u8.get(), &batch_f32);
+        if (!ok) {
+            LOG_ERR("Unable to preprocess image\n");
+            return 2;
+        }
 
-            // preprocess image
-            clip_image_f32_batch batch_f32;
-            bool ok = clip_image_preprocess(ctx->ctx_v, img_u8.get(), &batch_f32);
-            if (!ok) {
-                LOG_ERR("Unable to preprocess image\n");
-                return 2;
+        // handle llava-uhd style preprocessing
+        if (
+            ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5
+            || ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6
+            || ctx->slice_tmpl == MTMD_SLICE_TMPL_LLAMA4
+            || ctx->slice_tmpl == MTMD_SLICE_TMPL_IDEFICS3
+        ) {
+            const int n_col = batch_f32.grid_x;
+            const int n_row = batch_f32.grid_y;
+            // split batch into chunks of single images
+            // NOTE: batch_f32 will be invalidated after this call
+            auto chunks = split_batch_to_chunk(std::move(batch_f32), bitmap->id);
+            GGML_ASSERT(chunks.size() > 0);
+
+            auto ov_chunk = std::move(chunks.front());
+            chunks.erase(chunks.begin());
+
+            // add overview image (first)
+            if (ctx->ov_img_first) {
+                add_text(ctx->tok_ov_img_start);
+                cur.entries.emplace_back(std::move(ov_chunk));
+                add_text(ctx->tok_ov_img_end);
             }
-
-            // handle llava-uhd style preprocessing
-            if (
-                ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5
-                || ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6
-                || ctx->slice_tmpl == MTMD_SLICE_TMPL_LLAMA4
-                || ctx->slice_tmpl == MTMD_SLICE_TMPL_IDEFICS3
-            ) {
-                const int n_col = batch_f32.grid_x;
-                const int n_row = batch_f32.grid_y;
-                // split batch into chunks of single images
-                // NOTE: batch_f32 will be invalidated after this call
-                auto chunks = split_batch_to_chunk(std::move(batch_f32), bitmap->id);
-                GGML_ASSERT(chunks.size() > 0);
-
-                auto ov_chunk = std::move(chunks.front());
-                chunks.erase(chunks.begin());
-
-                // add overview image (first)
-                if (ctx->ov_img_first) {
-                    add_text(ctx->tok_ov_img_start);
-                    cur.entries.emplace_back(std::move(ov_chunk));
-                    add_text(ctx->tok_ov_img_end);
-                }
-
+            
+            if(have_slices){
                 // add slices (or tiles)
                 if (!chunks.empty()) {
                     GGML_ASSERT((int)chunks.size() == n_row * n_col);
@@ -585,52 +610,56 @@ struct mtmd_tokenizer {
                     }
                     add_text(ctx->tok_slices_end);
                 }
+            }
 
-                // add overview image (last)
-                if (!ctx->ov_img_first) {
-                    add_text(ctx->tok_ov_img_start);
-                    cur.entries.emplace_back(std::move(ov_chunk));
-                    add_text(ctx->tok_ov_img_end);
-                }
+            // add overview image (last)
+            if (!ctx->ov_img_first) {
+                add_text(ctx->tok_ov_img_start);
+                cur.entries.emplace_back(std::move(ov_chunk));
+                add_text(ctx->tok_ov_img_end);
+            }
 
-            } else {
-                size_t n_tokens = 0;
-                for (const auto & entry : batch_f32.entries) {
-                    n_tokens += clip_n_output_tokens(ctx->ctx_v, entry.get());
-                }
+        } else {
+            size_t n_tokens = 0;
+            for (const auto & entry : batch_f32.entries) {
+                n_tokens += clip_n_output_tokens(ctx->ctx_v, entry.get());
+            }
 
-                mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
-                if (ctx->use_mrope) {
-                    // for Qwen2VL, we need this information for M-RoPE decoding positions
-                    image_tokens->nx = clip_n_output_tokens_x(ctx->ctx_v, batch_f32.entries[0].get());
-                    image_tokens->ny = clip_n_output_tokens_y(ctx->ctx_v, batch_f32.entries[0].get());
-                    image_tokens->use_mrope_pos = true;
-                } else {
-                    // other models, we only need the total number of tokens
-                    image_tokens->nx = n_tokens;
-                    image_tokens->ny = 1;
-                }
-                image_tokens->batch_f32 = std::move(batch_f32);
-                image_tokens->id = bitmap->id; // optional
+            mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
+            if (ctx->use_mrope) {
+                // for Qwen2VL, we need this information for M-RoPE decoding positions
+                image_tokens->nx = clip_n_output_tokens_x(ctx->ctx_v, batch_f32.entries[0].get());
+                image_tokens->ny = clip_n_output_tokens_y(ctx->ctx_v, batch_f32.entries[0].get());
+                image_tokens->use_mrope_pos = true;
+            } else {
+                // other models, we only need the total number of tokens
+                image_tokens->nx = n_tokens;
+                image_tokens->ny = 1;
+            }
+            image_tokens->batch_f32 = std::move(batch_f32);
+            image_tokens->id = bitmap->id; // optional
 
-                LOG_DBG("image_tokens->nx = %d\n", image_tokens->nx);
-                LOG_DBG("image_tokens->ny = %d\n", image_tokens->ny);
-                LOG_DBG("batch_f32 size = %d\n", (int)image_tokens->batch_f32.entries.size());
+            LOG_DBG("image_tokens->nx = %d\n", image_tokens->nx);
+            LOG_DBG("image_tokens->ny = %d\n", image_tokens->ny);
+            LOG_DBG("batch_f32 size = %d\n", (int)image_tokens->batch_f32.entries.size());
 
-                mtmd_input_chunk chunk{
-                    MTMD_INPUT_CHUNK_TYPE_IMAGE,
-                    {}, // text tokens
-                    std::move(image_tokens),
-                    nullptr, // audio tokens
-                };
-                cur.entries.emplace_back(std::move(chunk));
-            }
+            mtmd_input_chunk chunk{
+                MTMD_INPUT_CHUNK_TYPE_IMAGE,
+                {}, // text tokens
+                std::move(image_tokens),
+                nullptr, // audio tokens
+            };
+            cur.entries.emplace_back(std::move(chunk));
+        }
 
-            if (!ctx->img_end.empty()) {
-                add_text(ctx->img_end, true); // add image end token
-            }
+        if (!ctx->img_end.empty()) {
+            add_text(ctx->img_end, true); // add image end token
+        }
+        return 0;
+    }
 
-        } else {
+    int32_t add_media(const mtmd_bitmap * bitmap) {
+        if(mtmd_bitmap_is_audio(bitmap)){
             // handle audio
 
             if (!ctx->ctx_a) {
@@ -690,9 +719,39 @@ struct mtmd_tokenizer {
             if (!ctx->aud_end.empty()) {
                 add_text(ctx->aud_end, true); // add audio end token
             }
+
+            return 0;
         }
 
-        return 0;
+        if(mtmd_bitmap_is_video(bitmap)){
+            // handle video
+            if (!ctx->ctx_v) {
+                LOG_ERR("%s: error: model does not support vision input\n", __func__);
+                return 2;
+            }
+
+            if (
+                ctx->video_sch == MTMD_VIDEO_SCHEMA_MINICPMV_4_5
+            ) {
+                // TODO
+                LOG_ERR("%s: error: video schema MINICPMV_4_5 is not implemented yet\n", __func__);
+                return 2;
+            }else{
+                const auto frame_size = bitmap->nx * bitmap->ny * 3;
+                for(uint32_t f = 0; f < bitmap->nz; f++){
+                    // extract frame f
+                    mtmd_bitmap* frame_bitmap = mtmd_bitmap_init(bitmap->nx, bitmap->ny, bitmap->data.data() + f * frame_size);
+                    // minicpmv 2.6 video understanding: disable UHD slicing (overview only)
+                    int32_t res = add_image(frame_bitmap, false /* have_slices */);
+                    mtmd_bitmap_free(frame_bitmap);
+                    if (res != 0) return res;
+                }
+            }
+            return 0;
+        }
+
+        // handle image otherwise
+        return add_image(bitmap);
     }
 
     std::vector<mtmd_input_chunk> split_batch_to_chunk(clip_image_f32_batch && batch_f32, const std::string & id) {
@@ -862,6 +921,7 @@ int mtmd_get_audio_bitrate(mtmd_context * ctx) {
     return 16000; // 16kHz
 }
 
+
 //
 // public API functions
 //
@@ -874,6 +934,8 @@ mtmd_bitmap * mtmd_bitmap_init(uint32_t nx,
     mtmd_bitmap * bitmap = new mtmd_bitmap;
     bitmap->nx = nx;
     bitmap->ny = ny;
+    bitmap->nz = 1;
+    bitmap->type = MTMD_BITMAP_TYPE_IMAGE;
     size_t data_size = (size_t)nx * ny * 3;
     bitmap->data.resize(data_size);
     std::memcpy(bitmap->data.data(), data, data_size);
@@ -885,13 +947,29 @@ mtmd_bitmap * mtmd_bitmap_init_from_audio(size_t n_samples,
     mtmd_bitmap * bitmap = new mtmd_bitmap;
     bitmap->nx = n_samples;
     bitmap->ny = 1;
-    bitmap->is_audio = true;
+    bitmap->nz = 1;
+    bitmap->type = MTMD_BITMAP_TYPE_AUDIO;
     size_t data_size = n_samples * sizeof(float);
     bitmap->data.resize(data_size);
     std::memcpy(bitmap->data.data(), data, data_size);
     return bitmap;
 }
 
+mtmd_bitmap * mtmd_bitmap_init_from_video(uint32_t nx, uint32_t ny, uint32_t nframes, const unsigned char * data){
+    // allocates storage for nframes RGB frames of nx * ny pixels; the pixels
+    // are copied from data when given, otherwise the storage stays zero-filled
+    const size_t n_bytes = (size_t)nx * ny * nframes * 3;
+    mtmd_bitmap * video = new mtmd_bitmap;
+    video->type = MTMD_BITMAP_TYPE_VIDEO;
+    video->nx   = nx;
+    video->ny   = ny;
+    video->nz   = nframes;
+    video->data.resize(n_bytes);
+    if (data) std::memcpy(video->data.data(), data, n_bytes);
+    return video;
+}
+
+
 uint32_t mtmd_bitmap_get_nx(const mtmd_bitmap * bitmap) {
     return bitmap->nx;
 }
@@ -909,7 +987,11 @@ size_t mtmd_bitmap_get_n_bytes(const mtmd_bitmap * bitmap) {
 }
 
 bool mtmd_bitmap_is_audio(const mtmd_bitmap * bitmap) {
-    return bitmap->is_audio;
+    return bitmap->type == MTMD_BITMAP_TYPE_AUDIO;
+}
+
+bool mtmd_bitmap_is_video(const mtmd_bitmap * bitmap) { // true only for bitmaps tagged MTMD_BITMAP_TYPE_VIDEO
+    return bitmap->type == MTMD_BITMAP_TYPE_VIDEO;
 }
 
 const char * mtmd_bitmap_get_id(const mtmd_bitmap * bitmap) {
@@ -930,6 +1012,20 @@ void mtmd_bitmap_free(mtmd_bitmap * bitmap) {
     }
 }
 
+bool mtmd_bitmap_set_frame(mtmd_bitmap * bitmap, size_t i, const unsigned char * data) { // overwrite frame i with nx * ny * 3 bytes from data
+    if (!mtmd_bitmap_is_video(bitmap)) {
+        LOG_ERR("%s: error: bitmap is not a video\n", __func__);
+        return false;
+    }
+    if ((size_t)bitmap->nz <= i) {
+        LOG_ERR("%s: error: frame index %zu out of range (nz = %u)\n", __func__, i, bitmap->nz);
+        return false;
+    }
+    const size_t bytes_per_frame = (size_t)bitmap->nx * bitmap->ny * 3;
+    std::memcpy(bitmap->data.data() + i * bytes_per_frame, data, bytes_per_frame);
+    return true;
+}
+
 // mtmd_input_chunks
 
 mtmd_input_chunks * mtmd_input_chunks_init() {
diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h
index 775fba6215c7c..de5e32d06c780 100644
--- a/tools/mtmd/mtmd.h
+++ b/tools/mtmd/mtmd.h
@@ -125,14 +125,23 @@ MTMD_API int mtmd_get_audio_bitrate(mtmd_context * ctx);
 // if bitmap is audio:
 //     length of data must be n_samples * sizeof(float)
 //     the data is in float format (PCM F32)
+// if bitmap is video:
+//     length of data must be nx * ny * nframes * 3
+//     the data is in RGBRGBRGB... format for each frame, frames are stored sequentially
+//     if data is nullptr, an empty bitmap is created (you can init content later)
 MTMD_API mtmd_bitmap *         mtmd_bitmap_init           (uint32_t nx, uint32_t ny, const unsigned char * data);
 MTMD_API mtmd_bitmap *         mtmd_bitmap_init_from_audio(size_t n_samples,         const float         * data);
+MTMD_API mtmd_bitmap *         mtmd_bitmap_init_from_video(uint32_t nx, uint32_t ny, uint32_t nframes, const unsigned char * data);
 MTMD_API uint32_t              mtmd_bitmap_get_nx     (const mtmd_bitmap * bitmap);
 MTMD_API uint32_t              mtmd_bitmap_get_ny     (const mtmd_bitmap * bitmap);
 MTMD_API const unsigned char * mtmd_bitmap_get_data   (const mtmd_bitmap * bitmap);
 MTMD_API size_t                mtmd_bitmap_get_n_bytes(const mtmd_bitmap * bitmap);
 MTMD_API bool                  mtmd_bitmap_is_audio   (const mtmd_bitmap * bitmap);
+MTMD_API bool                  mtmd_bitmap_is_video   (const mtmd_bitmap * bitmap);
 MTMD_API void                  mtmd_bitmap_free       (mtmd_bitmap * bitmap);
+// set single frame i of the video bitmap
+// return false if bitmap is not video or i is out of range
+MTMD_API bool                  mtmd_bitmap_set_frame  (mtmd_bitmap * bitmap, size_t i, const unsigned char * data);
 // bitmap ID is optional, but useful for KV cache tracking
 // these getters/setters are dedicated functions, so you can for example calculate the hash of the image based on mtmd_bitmap_get_data()
 MTMD_API const char * mtmd_bitmap_get_id(const mtmd_bitmap * bitmap);
diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index a8d7773c96809..ccf633a299df7 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -3983,7 +3983,6 @@ struct server_context {
                         }
 
                         slot.n_prompt_tokens_processed += n_tokens_out;
-
                         // add the image chunk to cache
                         {
                             const auto & chunk = input_tokens.find_chunk(slot.prompt.n_tokens());