ggml-org
diff --git a/common/arg.cpp b/common/arg.cpp
Lines changed: 7 additions & 0 deletions
diff --git a/common/common.h b/common/common.h
Lines changed: 1 addition & 0 deletions
diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt
Lines changed: 20 additions & 0 deletions
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
Lines changed: 64 additions & 52 deletions
diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h
Lines changed: 1 addition & 0 deletions
diff --git a/tools/mtmd/mtmd-cli.cpp b/tools/mtmd/mtmd-cli.cpp
Lines changed: 77 additions & 13 deletions
@@ -2514,6 +2514,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.image.emplace_back(value);
         }
     ).set_examples({LLAMA_EXAMPLE_MTMD}));
+    add_opt(common_arg(
+        {"--video"}, "PATH",
+        "path to a video file (requires FFmpeg at build time) or a directory of frames; can be repeated.\n",
+        [](common_params & params, const std::string & value) {
+            params.video.emplace_back(value);
+        }
+    ).set_examples({LLAMA_EXAMPLE_MTMD}));
     if (llama_supports_rpc()) {
         add_opt(common_arg(
             {"--rpc"}, "SERVERS",
 
@@ -405,6 +405,7 @@ struct common_params {
     bool mmproj_use_gpu = true;     // use GPU for multimodal model
     bool no_mmproj = false;         // explicitly disable multimodal model
     std::vector<std::string> image; // path to image file(s)
+    std::vector<std::string> video; // path to video file(s) or frame directories
 
     // finetune
     struct lr_opt lr;
 
@@ -5,12 +5,14 @@ find_package(Threads REQUIRED)
 add_library(mtmd
             mtmd.cpp
             mtmd-audio.cpp
+            mtmd-video.cpp
             mtmd.h
             clip.cpp
             clip.h
             clip-impl.h
             mtmd-helper.cpp
             mtmd-helper.h
+            mtmd-video.h
             )
 
 target_link_libraries     (mtmd PUBLIC ggml llama)
@@ -20,6 +22,23 @@ target_include_directories(mtmd PRIVATE ../..)
 target_include_directories(mtmd PRIVATE ../../vendor)
 target_compile_features   (mtmd PRIVATE cxx_std_17)
 
+# Optional FFmpeg support for video decoding (opt-in: the option below defaults to OFF)
+option(MTMD_WITH_FFMPEG "Enable FFmpeg-based video decoding in mtmd-video" OFF)
+if (MTMD_WITH_FFMPEG)
+    find_package(PkgConfig QUIET) # FFmpeg is located through pkg-config only; no other lookup is attempted here
+    if (PKG_CONFIG_FOUND)
+        pkg_check_modules(FFMPEG QUIET IMPORTED_TARGET libavformat libavcodec libswscale libavutil) # all four modules are requested in one check
+        if (FFMPEG_FOUND)
+            target_link_libraries(mtmd PRIVATE PkgConfig::FFMPEG) # imported target created by IMPORTED_TARGET above
+            target_compile_definitions(mtmd PRIVATE MTMD_WITH_FFMPEG) # PRIVATE: the macro is defined only when compiling mtmd itself
+        else()
+            message(WARNING "FFmpeg not found via pkg-config; MTMD_WITH_FFMPEG disabled") # warning only: configure continues, the macro is just not defined
+        endif()
+    else()
+        message(WARNING "pkg-config not found; MTMD_WITH_FFMPEG disabled") # same fallback; note the cache option itself is not reset to OFF
+    endif()
+endif()
+
 if (BUILD_SHARED_LIBS)
     set_target_properties     (mtmd PROPERTIES POSITION_INDEPENDENT_CODE ON)
     target_compile_definitions(mtmd PRIVATE LLAMA_BUILD)
@@ -29,6 +48,7 @@ endif()
 set(MTMD_PUBLIC_HEADERS
     ${CMAKE_CURRENT_SOURCE_DIR}/mtmd.h
     ${CMAKE_CURRENT_SOURCE_DIR}/mtmd-helper.h
+    ${CMAKE_CURRENT_SOURCE_DIR}/mtmd-video.h
     )
 
 set_target_properties(mtmd
 
@@ -201,6 +201,7 @@ struct clip_hparams {
     // legacy
     bool has_llava_projector = false;
     int minicpmv_version = 0;
+    int minicpmv_max_slice_nums = 9;
     int32_t minicpmv_query_num = 0;         // MiniCPM-V query number
 };
 
@@ -3260,16 +3261,67 @@ struct llava_uhd {
         const bool has_slices    = original_size.width > slice_size || original_size.height > slice_size;
         const bool has_pinpoints = !ctx->model.hparams.image_res_candidates.empty();
 
-        if (!has_slices) {
-            // skip slicing logic
-            res.overview_size = clip_image_size{slice_size, slice_size};
-            res.refined_size  = clip_image_size{0, 0};
-            res.grid_size     = clip_image_size{0, 0};
+        if (clip_is_minicpmv(ctx)) {
+            auto best_size    = get_best_resize(original_size, slice_size, patch_size, !has_slices);
+            res.overview_size = best_size;
+
+            {
+                const int max_slice_nums = ctx->model.hparams.minicpmv_max_slice_nums;
+                const float log_ratio = log((float)original_width / original_height);
+                const float ratio = (float)original_width * original_height / (slice_size * slice_size);
+                const int multiple = fmin(ceil(ratio), max_slice_nums);
+
+                auto best_grid   = get_best_grid(max_slice_nums, multiple, log_ratio);
+                auto refine_size = get_refine_size(original_size, best_grid, slice_size, patch_size, true);
+                res.grid_size    = best_grid;
+                res.refined_size = refine_size;
+
+                LOG_DBG("%s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d, grid size: %d x %d\n",
+                        __func__, original_width, original_height,
+                        res.overview_size.width, res.overview_size.height,
+                        res.refined_size.width, res.refined_size.height,
+                        res.grid_size.width, res.grid_size.height);
+
+                if (!has_slices || max_slice_nums == 0) {
+                    return res;
+                }
+
+                int width  = refine_size.width;
+                int height = refine_size.height;
+                int grid_x = int(width  / best_grid.width);
+                int grid_y = int(height / best_grid.height);
+                for (int patches_y = 0,                    ic = 0;
+                        patches_y < refine_size.height && ic < best_grid.height;
+                        patches_y += grid_y,              ic += 1) {
+                    for (int patches_x = 0,                   jc = 0;
+                            patches_x < refine_size.width && jc < best_grid.width;
+                            patches_x += grid_x,             jc += 1) {
+                        slice_coordinates slice;
+                        slice.x = patches_x;
+                        slice.y = patches_y;
+                        slice.size.width  = grid_x;
+                        slice.size.height = grid_y;
+                        res.slices.push_back(slice);
+                        LOG_DBG("%s: slice %d: x=%d, y=%d, size=%dx%d\n",
+                                __func__, (int)res.slices.size() - 1,
+                                slice.x, slice.y, slice.size.width, slice.size.height);
+                    }
+                }
+            }
 
             return res;
         }
+        else {
+            if (!has_slices) {
+                // skip slicing logic
+                res.overview_size = clip_image_size{slice_size, slice_size};
+                res.refined_size  = clip_image_size{0, 0};
+                res.grid_size     = clip_image_size{0, 0};
 
-        if (has_pinpoints) {
+                return res;
+            }
+
+            if (has_pinpoints) {
             // has pinpoints, use them to calculate the grid size (e.g. llava-1.6)
             auto refine_size = llava_uhd::select_best_resolution(
                 original_size,
@@ -3305,53 +3357,7 @@ struct llava_uhd {
 
             return res;
         }
-
-        // no pinpoints, dynamically calculate the grid size (e.g. minicpmv)
-
-        auto best_size    = get_best_resize(original_size, slice_size, patch_size, !has_slices);
-        res.overview_size = best_size;
-
-        {
-            const int max_slice_nums = 9; // TODO: this is only used by minicpmv, maybe remove it
-            const float log_ratio = log((float)original_width / original_height);
-            const float ratio = (float)original_width * original_height / (slice_size * slice_size);
-            const int multiple = fmin(ceil(ratio), max_slice_nums);
-
-            auto best_grid   = get_best_grid(max_slice_nums, multiple, log_ratio);
-            auto refine_size = get_refine_size(original_size, best_grid, slice_size, patch_size, true);
-            res.grid_size    = best_grid;
-            res.refined_size = refine_size;
-
-            LOG_DBG("%s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d, grid size: %d x %d\n",
-                    __func__, original_width, original_height,
-                    res.overview_size.width, res.overview_size.height,
-                    res.refined_size.width, res.refined_size.height,
-                    res.grid_size.width, res.grid_size.height);
-
-            int width  = refine_size.width;
-            int height = refine_size.height;
-            int grid_x = int(width  / best_grid.width);
-            int grid_y = int(height / best_grid.height);
-            for (int patches_y = 0,                    ic = 0;
-                    patches_y < refine_size.height && ic < best_grid.height;
-                    patches_y += grid_y,              ic += 1) {
-                for (int patches_x = 0,                   jc = 0;
-                        patches_x < refine_size.width && jc < best_grid.width;
-                        patches_x += grid_x,             jc += 1) {
-                    slice_coordinates slice;
-                    slice.x = patches_x;
-                    slice.y = patches_y;
-                    slice.size.width  = grid_x;
-                    slice.size.height = grid_y;
-                    res.slices.push_back(slice);
-                    LOG_DBG("%s: slice %d: x=%d, y=%d, size=%dx%d\n",
-                            __func__, (int)res.slices.size() - 1,
-                            slice.x, slice.y, slice.size.width, slice.size.height);
-                }
-            }
         }
-
-        return res;
     }
 
     static std::vector<clip_image_u8_ptr> slice_image(const clip_image_u8 * img, const slice_instructions & inst) {
@@ -4390,6 +4396,12 @@ bool clip_has_whisper_encoder(const struct clip_ctx * ctx) {
         || ctx->proj_type() == PROJECTOR_TYPE_VOXTRAL;
 }
 
+void clip_set_minicpmv_max_slice_nums(struct clip_ctx * ctx, int n) { // runtime override of hparams.minicpmv_max_slice_nums (struct default: 9)
+    if (!ctx) return; // null context: silently ignored, nothing is reported to the caller
+    if (n < 0) n = 0; // negative requests are clamped to 0; with 0 the minicpmv slicing path returns before emitting any slices
+    ctx->model.hparams.minicpmv_max_slice_nums = n; // stored as-is; no upper bound is enforced here
+}
+
 bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
     clip_image_f32 clip_img;
     clip_img.buf.resize(h * w * 3);
 
@@ -91,6 +91,7 @@ bool clip_image_encode      (struct clip_ctx * ctx, int n_threads, struct clip_i
 bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);
 
 int clip_is_minicpmv(const struct clip_ctx * ctx);
+void clip_set_minicpmv_max_slice_nums(struct clip_ctx * ctx, int n);
 bool clip_is_glm(const struct clip_ctx * ctx);
 bool clip_is_qwen2vl(const struct clip_ctx * ctx);
 bool clip_is_llava(const struct clip_ctx * ctx);
 
@@ -6,12 +6,15 @@
 #include "ggml.h"
 #include "console.h"
 #include "chat.h"
+#include "clip.h"
 #include "mtmd.h"
 #include "mtmd-helper.h"
+#include "mtmd-video.h"
 
 #include <vector>
 #include <limits.h>
 #include <cinttypes>
+#include <cstdlib>
 
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
 #include <signal.h>
@@ -158,6 +161,23 @@ struct mtmd_cli_context {
         bitmaps.entries.push_back(std::move(bmp));
         return true;
     }
+
+    // Load multiple frames from a directory as a "video" (sequence of images); frames are appended to `bitmaps`
+    // Returns number of frames appended (value is forwarded unchanged from mtmd_video::append_frames_from_dir)
+    size_t load_video_dir(const std::string & dir, int max_frames = 32, int stride = 1, bool recursive = false) {
+        mtmd_video::LoadVideoOptions opts; // fields not assigned below keep whatever defaults LoadVideoOptions declares
+        opts.max_frames = max_frames; // presumably an upper bound on frames taken — semantics live in mtmd_video, confirm there
+        opts.stride     = stride; // presumably "take every stride-th frame" — confirm in mtmd_video
+        opts.recursive  = recursive; // presumably whether sub-directories are scanned — confirm in mtmd_video
+        return mtmd_video::append_frames_from_dir(ctx_vision.get(), dir, bitmaps, opts);
+    }
+
+    size_t load_video_path(const std::string & path, int max_frames = 32, int stride = 1) { // appends frames from `path` to `bitmaps`; callers treat a return of 0 as failure
+        mtmd_video::LoadVideoOptions opts; // opts.recursive is not set here, so it keeps the LoadVideoOptions default
+        opts.max_frames = max_frames; // presumably an upper bound on frames taken — semantics live in mtmd_video, confirm there
+        opts.stride     = stride; // presumably "take every stride-th frame" — confirm in mtmd_video
+        return mtmd_video::append_frames_from_path(ctx_vision.get(), path, bitmaps, opts); // NOTE(review): file-vs-directory handling is decided inside this helper, not here
+    }
 };
 
 static int generate_response(mtmd_cli_context & ctx, int n_predict) {
@@ -266,7 +286,7 @@ int main(int argc, char ** argv) {
     mtmd_cli_context ctx(params);
     LOG("%s: loading model: %s\n", __func__, params.model.path.c_str());
 
-    bool is_single_turn = !params.prompt.empty() && !params.image.empty();
+    bool is_single_turn = !params.prompt.empty() && (!params.image.empty() || !params.video.empty());
 
     int n_predict = params.n_predict < 0 ? INT_MAX : params.n_predict;
 
@@ -290,19 +310,38 @@ int main(int argc, char ** argv) {
 
     if (is_single_turn) {
         g_is_generating = true;
-        if (params.prompt.find(mtmd_default_marker()) == std::string::npos) {
-            for (size_t i = 0; i < params.image.size(); i++) {
-                params.prompt += mtmd_default_marker();
-            }
-        }
-        common_chat_msg msg;
-        msg.role = "user";
-        msg.content = params.prompt;
+
+        // 1) load all media first
+        size_t n_loaded_media = 0;
         for (const auto & image : params.image) {
             if (!ctx.load_media(image)) {
                 return 1; // error is already printed by libmtmd
             }
+            n_loaded_media += 1;
+        }
+        for (const auto & vpath : params.video) {
+            // for video understanding: disable UHD slicing (overview only)
+            mtmd_set_minicpmv_max_slice_nums(ctx.ctx_vision.get(), 0);
+            size_t n = ctx.load_video_path(vpath, /*max_frames*/3, /*stride*/1);
+            if (n == 0) {
+                LOG_ERR("Unable to load video frames from %s\n", vpath.c_str());
+                return 1;
+            }
+            n_loaded_media += n;
+        }
+
+        // 2) build prompt content with correct number of markers
+        std::string prompt_content = params.prompt;
+        if (prompt_content.find(mtmd_default_marker()) == std::string::npos) {
+            for (size_t i = 0; i < n_loaded_media; i++) {
+                prompt_content += mtmd_default_marker();
+            }
         }
+
+        // 3) run
+        common_chat_msg msg;
+        msg.role = "user";
+        msg.content = prompt_content;
         if (eval_message(ctx, msg, true)) {
             return 1;
         }
@@ -318,6 +357,9 @@ int main(int argc, char ** argv) {
         if (mtmd_support_audio(ctx.ctx_vision.get())) {
             LOG("\n   /audio <path>    load an audio");
         }
+        if (mtmd_support_vision(ctx.ctx_vision.get())) {
+            LOG("\n   /video <dir>     load frames from a directory as a video");
+        }
         LOG("\n   /clear           clear the chat history");
         LOG("\n   /quit or /exit   exit the program");
         LOG("\n");
@@ -349,15 +391,37 @@ int main(int argc, char ** argv) {
             g_is_generating = true;
             bool is_image = line == "/image" || line.find("/image ") == 0;
             bool is_audio = line == "/audio" || line.find("/audio ") == 0;
-            if (is_image || is_audio) {
+            bool is_video = line == "/video" || line.find("/video ") == 0;
+            if (is_image || is_audio || is_video) {
                 if (line.size() < 8) {
                     LOG_ERR("ERR: Missing media filename\n");
                     continue;
                 }
                 std::string media_path = line.substr(7);
-                if (ctx.load_media(media_path)) {
-                    LOG("%s %s loaded\n", media_path.c_str(), is_image ? "image" : "audio");
-                    content += mtmd_default_marker();
+                if (is_video) {
+                    // parse optional args: "/video <dir> [max_frames] [stride]"
+                    // simple split by spaces
+                    std::vector<std::string> parts = string_split(media_path, " ");
+                    std::string dir = parts.size() > 0 ? parts[0] : media_path;
+                    int max_frames = 32;
+                    int stride = 1;
+                    if (parts.size() > 1) max_frames = std::max(1, atoi(parts[1].c_str()));
+                    if (parts.size() > 2) stride     = std::max(1, atoi(parts[2].c_str()));
+                    size_t n = ctx.load_video_path(dir, max_frames, stride);
+                    if (n > 0) {
+                        LOG("%s video loaded with %zu frames\n", dir.c_str(), n);
+                        // add one marker per frame to match mtmd_tokenize expectations
+                        for (size_t i = 0; i < n; ++i) {
+                            content += mtmd_default_marker();
+                        }
+                    } else {
+                        LOG_ERR("ERR: failed to load video frames from %s\n", dir.c_str());
+                    }
+                } else {
+                    if (ctx.load_media(media_path)) {
+                        LOG("%s %s loaded\n", media_path.c_str(), is_image ? "image" : "audio");
+                        content += mtmd_default_marker();
+                    }
                 }
                 // else, error is already printed by libmtmd
                 continue;