Skip to content

Commit ae099ec

Browse files
committed
add video
Signed-off-by: tc-mb <caitianchi@modelbest.cn>
1 parent 4b9f4cb commit ae099ec

File tree

10 files changed

+493
-65
lines changed

10 files changed

+493
-65
lines changed

common/arg.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2514,6 +2514,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
25142514
params.image.emplace_back(value);
25152515
}
25162516
).set_examples({LLAMA_EXAMPLE_MTMD}));
2517+
add_opt(common_arg(
2518+
{"--video"}, "PATH",
2519+
"path to a video file (requires FFmpeg at build time) or a directory of frames; can be repeated.\n",
2520+
[](common_params & params, const std::string & value) {
2521+
params.video.emplace_back(value);
2522+
}
2523+
).set_examples({LLAMA_EXAMPLE_MTMD}));
25172524
if (llama_supports_rpc()) {
25182525
add_opt(common_arg(
25192526
{"--rpc"}, "SERVERS",

common/common.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -405,6 +405,7 @@ struct common_params {
405405
bool mmproj_use_gpu = true; // use GPU for multimodal model
406406
bool no_mmproj = false; // explicitly disable multimodal model
407407
std::vector<std::string> image; // path to image file(s)
408+
std::vector<std::string> video; // path to video file(s) or frame directories
408409

409410
// finetune
410411
struct lr_opt lr;

tools/mtmd/CMakeLists.txt

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,14 @@ find_package(Threads REQUIRED)
55
add_library(mtmd
66
mtmd.cpp
77
mtmd-audio.cpp
8+
mtmd-video.cpp
89
mtmd.h
910
clip.cpp
1011
clip.h
1112
clip-impl.h
1213
mtmd-helper.cpp
1314
mtmd-helper.h
15+
mtmd-video.h
1416
)
1517

1618
target_link_libraries (mtmd PUBLIC ggml llama)
@@ -20,6 +22,23 @@ target_include_directories(mtmd PRIVATE ../..)
2022
target_include_directories(mtmd PRIVATE ../../vendor)
2123
target_compile_features (mtmd PRIVATE cxx_std_17)
2224

25+
# Optional FFmpeg support for video decoding.
# Off by default. When requested, FFmpeg is located through pkg-config; on success the
# mtmd target links the imported PkgConfig::FFMPEG target and is compiled with the
# MTMD_WITH_FFMPEG definition.
26+
option(MTMD_WITH_FFMPEG "Enable FFmpeg-based video decoding in mtmd-video" OFF)
27+
if (MTMD_WITH_FFMPEG)
28+
find_package(PkgConfig QUIET)
29+
if (PKG_CONFIG_FOUND)
30+
# All four FFmpeg libraries are requested in a single pkg_check_modules() call.
pkg_check_modules(FFMPEG QUIET IMPORTED_TARGET libavformat libavcodec libswscale libavutil)
31+
if (FFMPEG_FOUND)
32+
target_link_libraries(mtmd PRIVATE PkgConfig::FFMPEG)
33+
# PRIVATE: the definition is applied to mtmd's own sources only.
target_compile_definitions(mtmd PRIVATE MTMD_WITH_FFMPEG)
34+
else()
35+
# Only a warning is printed here: the MTMD_WITH_FFMPEG option itself is not reset,
# the build simply proceeds without the compile definition.
message(WARNING "FFmpeg not found via pkg-config; MTMD_WITH_FFMPEG disabled")
36+
endif()
37+
else()
38+
# Same as above: warn and continue without the compile definition.
message(WARNING "pkg-config not found; MTMD_WITH_FFMPEG disabled")
39+
endif()
40+
endif()
41+
2342
if (BUILD_SHARED_LIBS)
2443
set_target_properties (mtmd PROPERTIES POSITION_INDEPENDENT_CODE ON)
2544
target_compile_definitions(mtmd PRIVATE LLAMA_BUILD)
@@ -29,6 +48,7 @@ endif()
2948
set(MTMD_PUBLIC_HEADERS
3049
${CMAKE_CURRENT_SOURCE_DIR}/mtmd.h
3150
${CMAKE_CURRENT_SOURCE_DIR}/mtmd-helper.h
51+
${CMAKE_CURRENT_SOURCE_DIR}/mtmd-video.h
3252
)
3353

3454
set_target_properties(mtmd

tools/mtmd/clip.cpp

Lines changed: 64 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -201,6 +201,7 @@ struct clip_hparams {
201201
// legacy
202202
bool has_llava_projector = false;
203203
int minicpmv_version = 0;
204+
int minicpmv_max_slice_nums = 9;
204205
int32_t minicpmv_query_num = 0; // MiniCPM-V query number
205206
};
206207

@@ -3260,16 +3261,67 @@ struct llava_uhd {
32603261
const bool has_slices = original_size.width > slice_size || original_size.height > slice_size;
32613262
const bool has_pinpoints = !ctx->model.hparams.image_res_candidates.empty();
32623263

3263-
if (!has_slices) {
3264-
// skip slicing logic
3265-
res.overview_size = clip_image_size{slice_size, slice_size};
3266-
res.refined_size = clip_image_size{0, 0};
3267-
res.grid_size = clip_image_size{0, 0};
3264+
if (clip_is_minicpmv(ctx)) {
3265+
auto best_size = get_best_resize(original_size, slice_size, patch_size, !has_slices);
3266+
res.overview_size = best_size;
3267+
3268+
{
3269+
const int max_slice_nums = ctx->model.hparams.minicpmv_max_slice_nums;
3270+
const float log_ratio = log((float)original_width / original_height);
3271+
const float ratio = (float)original_width * original_height / (slice_size * slice_size);
3272+
const int multiple = fmin(ceil(ratio), max_slice_nums);
3273+
3274+
auto best_grid = get_best_grid(max_slice_nums, multiple, log_ratio);
3275+
auto refine_size = get_refine_size(original_size, best_grid, slice_size, patch_size, true);
3276+
res.grid_size = best_grid;
3277+
res.refined_size = refine_size;
3278+
3279+
LOG_DBG("%s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d, grid size: %d x %d\n",
3280+
__func__, original_width, original_height,
3281+
res.overview_size.width, res.overview_size.height,
3282+
res.refined_size.width, res.refined_size.height,
3283+
res.grid_size.width, res.grid_size.height);
3284+
3285+
if (!has_slices || max_slice_nums == 0) {
3286+
return res;
3287+
}
3288+
3289+
int width = refine_size.width;
3290+
int height = refine_size.height;
3291+
int grid_x = int(width / best_grid.width);
3292+
int grid_y = int(height / best_grid.height);
3293+
for (int patches_y = 0, ic = 0;
3294+
patches_y < refine_size.height && ic < best_grid.height;
3295+
patches_y += grid_y, ic += 1) {
3296+
for (int patches_x = 0, jc = 0;
3297+
patches_x < refine_size.width && jc < best_grid.width;
3298+
patches_x += grid_x, jc += 1) {
3299+
slice_coordinates slice;
3300+
slice.x = patches_x;
3301+
slice.y = patches_y;
3302+
slice.size.width = grid_x;
3303+
slice.size.height = grid_y;
3304+
res.slices.push_back(slice);
3305+
LOG_DBG("%s: slice %d: x=%d, y=%d, size=%dx%d\n",
3306+
__func__, (int)res.slices.size() - 1,
3307+
slice.x, slice.y, slice.size.width, slice.size.height);
3308+
}
3309+
}
3310+
}
32683311

32693312
return res;
32703313
}
3314+
else {
3315+
if (!has_slices) {
3316+
// skip slicing logic
3317+
res.overview_size = clip_image_size{slice_size, slice_size};
3318+
res.refined_size = clip_image_size{0, 0};
3319+
res.grid_size = clip_image_size{0, 0};
32713320

3272-
if (has_pinpoints) {
3321+
return res;
3322+
}
3323+
3324+
if (has_pinpoints) {
32733325
// has pinpoints, use them to calculate the grid size (e.g. llava-1.6)
32743326
auto refine_size = llava_uhd::select_best_resolution(
32753327
original_size,
@@ -3305,53 +3357,7 @@ struct llava_uhd {
33053357

33063358
return res;
33073359
}
3308-
3309-
// no pinpoints, dynamically calculate the grid size (e.g. minicpmv)
3310-
3311-
auto best_size = get_best_resize(original_size, slice_size, patch_size, !has_slices);
3312-
res.overview_size = best_size;
3313-
3314-
{
3315-
const int max_slice_nums = 9; // TODO: this is only used by minicpmv, maybe remove it
3316-
const float log_ratio = log((float)original_width / original_height);
3317-
const float ratio = (float)original_width * original_height / (slice_size * slice_size);
3318-
const int multiple = fmin(ceil(ratio), max_slice_nums);
3319-
3320-
auto best_grid = get_best_grid(max_slice_nums, multiple, log_ratio);
3321-
auto refine_size = get_refine_size(original_size, best_grid, slice_size, patch_size, true);
3322-
res.grid_size = best_grid;
3323-
res.refined_size = refine_size;
3324-
3325-
LOG_DBG("%s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d, grid size: %d x %d\n",
3326-
__func__, original_width, original_height,
3327-
res.overview_size.width, res.overview_size.height,
3328-
res.refined_size.width, res.refined_size.height,
3329-
res.grid_size.width, res.grid_size.height);
3330-
3331-
int width = refine_size.width;
3332-
int height = refine_size.height;
3333-
int grid_x = int(width / best_grid.width);
3334-
int grid_y = int(height / best_grid.height);
3335-
for (int patches_y = 0, ic = 0;
3336-
patches_y < refine_size.height && ic < best_grid.height;
3337-
patches_y += grid_y, ic += 1) {
3338-
for (int patches_x = 0, jc = 0;
3339-
patches_x < refine_size.width && jc < best_grid.width;
3340-
patches_x += grid_x, jc += 1) {
3341-
slice_coordinates slice;
3342-
slice.x = patches_x;
3343-
slice.y = patches_y;
3344-
slice.size.width = grid_x;
3345-
slice.size.height = grid_y;
3346-
res.slices.push_back(slice);
3347-
LOG_DBG("%s: slice %d: x=%d, y=%d, size=%dx%d\n",
3348-
__func__, (int)res.slices.size() - 1,
3349-
slice.x, slice.y, slice.size.width, slice.size.height);
3350-
}
3351-
}
33523360
}
3353-
3354-
return res;
33553361
}
33563362

33573363
static std::vector<clip_image_u8_ptr> slice_image(const clip_image_u8 * img, const slice_instructions & inst) {
@@ -4390,6 +4396,12 @@ bool clip_has_whisper_encoder(const struct clip_ctx * ctx) {
43904396
|| ctx->proj_type() == PROJECTOR_TYPE_VOXTRAL;
43914397
}
43924398

4399+
// Override the MiniCPM-V slice limit stored in the context's hparams
// (hparams.minicpmv_max_slice_nums, whose in-struct default is 9).
//   ctx: context to modify; a null pointer makes the call a no-op.
//   n:   new limit; negative values are clamped to 0.
// The MiniCPM-V branch of the llava_uhd slicing code returns before generating any
// slices when this limit is 0, so passing 0 turns slicing off.
void clip_set_minicpmv_max_slice_nums(struct clip_ctx * ctx, int n) {
4400+
if (!ctx) return;
4401+
if (n < 0) n = 0; // clamp: a negative slice count is stored as 0
4402+
ctx->model.hparams.minicpmv_max_slice_nums = n;
4403+
}
4404+
43934405
bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
43944406
clip_image_f32 clip_img;
43954407
clip_img.buf.resize(h * w * 3);

tools/mtmd/clip.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,7 @@ bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_i
9191
bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);
9292

9393
int clip_is_minicpmv(const struct clip_ctx * ctx);
94+
void clip_set_minicpmv_max_slice_nums(struct clip_ctx * ctx, int n);
9495
bool clip_is_glm(const struct clip_ctx * ctx);
9596
bool clip_is_qwen2vl(const struct clip_ctx * ctx);
9697
bool clip_is_llava(const struct clip_ctx * ctx);

tools/mtmd/mtmd-cli.cpp

Lines changed: 77 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,15 @@
66
#include "ggml.h"
77
#include "console.h"
88
#include "chat.h"
9+
#include "clip.h"
910
#include "mtmd.h"
1011
#include "mtmd-helper.h"
12+
#include "mtmd-video.h"
1113

1214
#include <vector>
1315
#include <limits.h>
1416
#include <cinttypes>
17+
#include <cstdlib>
1518

1619
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
1720
#include <signal.h>
@@ -158,6 +161,23 @@ struct mtmd_cli_context {
158161
bitmaps.entries.push_back(std::move(bmp));
159162
return true;
160163
}
164+
165+
// Load multiple frames from a directory as a "video" (sequence of images).
// Copies max_frames / stride / recursive into a mtmd_video::LoadVideoOptions and forwards to
// mtmd_video::append_frames_from_dir() together with ctx_vision and `bitmaps`.
// NOTE(review): presumably max_frames caps the frame count, stride is the sampling step and
// recursive enables descending into subdirectories - confirm against mtmd-video.h.
166+
// Returns number of frames appended (the helper's return value, passed through unchanged)
167+
size_t load_video_dir(const std::string & dir, int max_frames = 32, int stride = 1, bool recursive = false) {
168+
mtmd_video::LoadVideoOptions opts;
169+
opts.max_frames = max_frames;
170+
opts.stride = stride;
171+
opts.recursive = recursive;
172+
return mtmd_video::append_frames_from_dir(ctx_vision.get(), dir, bitmaps, opts);
173+
}
174+
175+
// Load video frames from `path` by forwarding to mtmd_video::append_frames_from_path()
// together with ctx_vision and `bitmaps`.
// Only max_frames and stride are copied into the options; every other LoadVideoOptions
// field keeps its default value.
// NOTE(review): presumably `path` may be either a video file or a directory of frames
// (that is what the --video help text advertises) - confirm against mtmd-video.h.
// Returns the helper's result unchanged; the callers in main() treat 0 as a load failure.
size_t load_video_path(const std::string & path, int max_frames = 32, int stride = 1) {
176+
mtmd_video::LoadVideoOptions opts;
177+
opts.max_frames = max_frames;
178+
opts.stride = stride;
179+
return mtmd_video::append_frames_from_path(ctx_vision.get(), path, bitmaps, opts);
180+
}
161181
};
162182

163183
static int generate_response(mtmd_cli_context & ctx, int n_predict) {
@@ -266,7 +286,7 @@ int main(int argc, char ** argv) {
266286
mtmd_cli_context ctx(params);
267287
LOG("%s: loading model: %s\n", __func__, params.model.path.c_str());
268288

269-
bool is_single_turn = !params.prompt.empty() && !params.image.empty();
289+
bool is_single_turn = !params.prompt.empty() && (!params.image.empty() || !params.video.empty());
270290

271291
int n_predict = params.n_predict < 0 ? INT_MAX : params.n_predict;
272292

@@ -290,19 +310,38 @@ int main(int argc, char ** argv) {
290310

291311
if (is_single_turn) {
292312
g_is_generating = true;
293-
if (params.prompt.find(mtmd_default_marker()) == std::string::npos) {
294-
for (size_t i = 0; i < params.image.size(); i++) {
295-
params.prompt += mtmd_default_marker();
296-
}
297-
}
298-
common_chat_msg msg;
299-
msg.role = "user";
300-
msg.content = params.prompt;
313+
314+
// 1) load all media first
315+
size_t n_loaded_media = 0;
301316
for (const auto & image : params.image) {
302317
if (!ctx.load_media(image)) {
303318
return 1; // error is already printed by libmtmd
304319
}
320+
n_loaded_media += 1;
321+
}
322+
for (const auto & vpath : params.video) {
323+
// for video understanding: disable UHD slicing (overview only)
324+
mtmd_set_minicpmv_max_slice_nums(ctx.ctx_vision.get(), 0);
325+
size_t n = ctx.load_video_path(vpath, /*max_frames*/3, /*stride*/1);
326+
if (n == 0) {
327+
LOG_ERR("Unable to load video frames from %s\n", vpath.c_str());
328+
return 1;
329+
}
330+
n_loaded_media += n;
331+
}
332+
333+
// 2) build prompt content with correct number of markers
334+
std::string prompt_content = params.prompt;
335+
if (prompt_content.find(mtmd_default_marker()) == std::string::npos) {
336+
for (size_t i = 0; i < n_loaded_media; i++) {
337+
prompt_content += mtmd_default_marker();
338+
}
305339
}
340+
341+
// 3) run
342+
common_chat_msg msg;
343+
msg.role = "user";
344+
msg.content = prompt_content;
306345
if (eval_message(ctx, msg, true)) {
307346
return 1;
308347
}
@@ -318,6 +357,9 @@ int main(int argc, char ** argv) {
318357
if (mtmd_support_audio(ctx.ctx_vision.get())) {
319358
LOG("\n /audio <path> load an audio");
320359
}
360+
if (mtmd_support_vision(ctx.ctx_vision.get())) {
361+
LOG("\n /video <dir> load frames from a directory as a video");
362+
}
321363
LOG("\n /clear clear the chat history");
322364
LOG("\n /quit or /exit exit the program");
323365
LOG("\n");
@@ -349,15 +391,37 @@ int main(int argc, char ** argv) {
349391
g_is_generating = true;
350392
bool is_image = line == "/image" || line.find("/image ") == 0;
351393
bool is_audio = line == "/audio" || line.find("/audio ") == 0;
352-
if (is_image || is_audio) {
394+
bool is_video = line == "/video" || line.find("/video ") == 0;
395+
if (is_image || is_audio || is_video) {
353396
if (line.size() < 8) {
354397
LOG_ERR("ERR: Missing media filename\n");
355398
continue;
356399
}
357400
std::string media_path = line.substr(7);
358-
if (ctx.load_media(media_path)) {
359-
LOG("%s %s loaded\n", media_path.c_str(), is_image ? "image" : "audio");
360-
content += mtmd_default_marker();
401+
if (is_video) {
402+
// parse optional args: "/video <dir> [max_frames] [stride]"
403+
// simple split by spaces
404+
std::vector<std::string> parts = string_split(media_path, " ");
405+
std::string dir = parts.size() > 0 ? parts[0] : media_path;
406+
int max_frames = 32;
407+
int stride = 1;
408+
if (parts.size() > 1) max_frames = std::max(1, atoi(parts[1].c_str()));
409+
if (parts.size() > 2) stride = std::max(1, atoi(parts[2].c_str()));
410+
size_t n = ctx.load_video_path(dir, max_frames, stride);
411+
if (n > 0) {
412+
LOG("%s video loaded with %zu frames\n", dir.c_str(), n);
413+
// add one marker per frame to match mtmd_tokenize expectations
414+
for (size_t i = 0; i < n; ++i) {
415+
content += mtmd_default_marker();
416+
}
417+
} else {
418+
LOG_ERR("ERR: failed to load video frames from %s\n", dir.c_str());
419+
}
420+
} else {
421+
if (ctx.load_media(media_path)) {
422+
LOG("%s %s loaded\n", media_path.c_str(), is_image ? "image" : "audio");
423+
content += mtmd_default_marker();
424+
}
361425
}
362426
// else, error is already printed by libmtmd
363427
continue;

0 commit comments

Comments
 (0)