Skip to content

Commit e377686

Browse files
committed
refactor: support video in media
1 parent f09abc0 commit e377686

File tree

6 files changed

+262
-204
lines changed

6 files changed

+262
-204
lines changed

tools/mtmd/mtmd-cli.cpp

Lines changed: 7 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -161,12 +161,6 @@ struct mtmd_cli_context {
161161
bitmaps.entries.push_back(std::move(bmp));
162162
return true;
163163
}
164-
165-
// Load multiple frames from a video file or a directory as a "video" (sequence of images)
166-
// Returns number of frames appended
167-
size_t load_video(const std::string & path) {
168-
return mtmd_video::append_frames_from_path(ctx_vision.get(), path, bitmaps);
169-
}
170164
};
171165

172166
static int generate_response(mtmd_cli_context & ctx, int n_predict) {
@@ -309,14 +303,10 @@ int main(int argc, char ** argv) {
309303
n_loaded_media += 1;
310304
}
311305
for (const auto & vpath : params.video) {
312-
// for video understanding: disable UHD slicing (overview only)
313-
mtmd_set_minicpmv_max_slice_nums(ctx.ctx_vision.get(), 0);
314-
size_t n = ctx.load_video(vpath);
315-
if (n == 0) {
316-
LOG_ERR("Unable to load video frames from %s\n", vpath.c_str());
317-
return 1;
306+
if (!ctx.load_media(vpath)) {
307+
return 1; // error is already printed by libmtmd
318308
}
319-
n_loaded_media += n;
309+
n_loaded_media += 1;
320310
}
321311

322312
// 2) build prompt content with correct number of markers
@@ -347,7 +337,7 @@ int main(int argc, char ** argv) {
347337
LOG("\n /audio <path> load an audio");
348338
}
349339
if (mtmd_support_vision(ctx.ctx_vision.get())) {
350-
LOG("\n /video <dir> load frames from a directory as a video");
340+
LOG("\n /video <path> load a video");
351341
}
352342
LOG("\n /clear clear the chat history");
353343
LOG("\n /quit or /exit exit the program");
@@ -387,26 +377,9 @@ int main(int argc, char ** argv) {
387377
continue;
388378
}
389379
std::string media_path = line.substr(7);
390-
if (is_video) {
391-
// parse optional args: "/video <file/dir path>"
392-
// simple split by spaces
393-
std::vector<std::string> parts = string_split(media_path, " ");
394-
std::string path = parts.size() > 0 ? parts[0] : media_path;
395-
size_t n = ctx.load_video(path);
396-
if (n > 0) {
397-
LOG("%s video loaded with %zu frames\n", path.c_str(), n);
398-
// add one marker per frame to match mtmd_tokenize expectations
399-
for (size_t i = 0; i < n; ++i) {
400-
content += mtmd_default_marker();
401-
}
402-
} else {
403-
LOG_ERR("ERR: failed to load video frames from %s\n", path.c_str());
404-
}
405-
} else {
406-
if (ctx.load_media(media_path)) {
407-
LOG("%s %s loaded\n", media_path.c_str(), is_image ? "image" : "audio");
408-
content += mtmd_default_marker();
409-
}
380+
if (ctx.load_media(media_path)) {
381+
LOG("%s %s loaded\n", media_path.c_str(), is_image ? "image" : (is_audio ? "audio" : "video"));
382+
content += mtmd_default_marker();
410383
}
411384
// else, error is already printed by libmtmd
412385
continue;

tools/mtmd/mtmd-helper.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@
1111
#include "mtmd-helper.h"
1212
#include "llama.h"
1313

14+
#include "mtmd-video.h"
15+
1416
#include <algorithm>
1517
#include <cinttypes>
1618
#include <vector>
@@ -437,6 +439,10 @@ mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigne
437439
}
438440

439441
mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname) {
442+
if(mtmd_video::is_video_file(fname)){
443+
return mtmd_video::init_video_bitmap_from_path(ctx, fname);
444+
}
445+
440446
std::vector<unsigned char> buf;
441447
FILE * f = fopen(fname, "rb");
442448
if (!f) {

tools/mtmd/mtmd-video.cpp

Lines changed: 61 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -90,45 +90,58 @@ static void list_files(const std::string & dir, std::vector<std::string> & out,
9090

9191
namespace mtmd_video {
9292

93-
bool load_frames_from_dir(mtmd_context * ctx,
93+
// Check whether `path` names a video file, judging only by its extension.
//
// The match is case-insensitive and anchored at the END of the path. A name
// that merely contains a video extension somewhere in the middle
// (e.g. "clip.mp4.png" or "frames.mp4/0001.jpg") is therefore not a video.
// Recognized extensions: .mp4, .mov, .mkv, .avi, .webm.
//
// Returns true if the path ends with one of the recognized extensions,
// false otherwise (including for an empty path).
bool is_video_file(const std::string & path){
    auto lower = path;
    std::transform(lower.begin(), lower.end(), lower.begin(), [](unsigned char c){ return (char)std::tolower(c); });

    // true when the lower-cased path ends with `ext`
    const auto has_suffix = [&lower](const std::string & ext) {
        return lower.size() >= ext.size() &&
               lower.compare(lower.size() - ext.size(), ext.size(), ext) == 0;
    };

    return has_suffix(".mp4") ||
           has_suffix(".mov") ||
           has_suffix(".mkv") ||
           has_suffix(".avi") ||
           has_suffix(".webm");
}
102+
103+
// untested
104+
static mtmd_bitmap* load_frames_from_dir(mtmd_context * ctx,
94105
const std::string & dir_path,
95-
std::vector<mtmd::bitmap> & out_frames,
96106
const LoadVideoOptions & opts) {
97-
if (!ctx || dir_path.empty() || !is_dir(dir_path)) {
98-
return false;
107+
if (!ctx || dir_path.empty() || !is_dir(dir_path) || opts.max_frames < 1) {
108+
return nullptr;
99109
}
100110
// note: hparam-based control is applied inside clip.cpp; nothing to set globally here
101111

102112
std::vector<std::string> files;
103113
list_files(dir_path, files, opts.recursive);
104114
std::sort(files.begin(), files.end());
105115

106-
int stride = std::max(1, opts.stride);
107-
int loaded = 0;
116+
auto stride = std::max(1u, opts.stride);
117+
size_t loaded = 0;
118+
unsigned char* dest = nullptr;
119+
mtmd_bitmap* out_frames = nullptr;
120+
121+
uint32_t w=0, h=0;
108122
for (size_t i = 0; i < files.size(); i++) {
109-
if ((int)i % stride != 0) continue;
123+
if (i % stride != 0) continue;
110124
const std::string & f = files[i];
111125
if (!has_image_ext(f)) continue;
112126
mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(ctx, f.c_str()));
113127
if (!bmp.ptr) continue;
114-
out_frames.push_back(std::move(bmp));
128+
if(loaded==0){
129+
w = bmp.nx();
130+
h = bmp.ny();
131+
out_frames = mtmd_bitmap_init_from_video(w, h, loaded, nullptr);
132+
dest = mtmd_bitmap_get_data_mutable(out_frames);
133+
}else if(bmp.nx() != w || bmp.ny() != h){
134+
return nullptr; // all frames must have the same size
135+
}
136+
std::memcpy(dest,
137+
bmp.data(),
138+
bmp.n_bytes());
139+
dest += bmp.n_bytes();
115140
loaded++;
116-
if (opts.max_frames > 0 && loaded >= opts.max_frames) break;
117-
}
118-
return loaded > 0;
119-
}
120-
121-
size_t append_frames_from_dir(mtmd_context * ctx,
122-
const std::string & dir_path,
123-
mtmd::bitmaps & dst,
124-
const LoadVideoOptions & opts) {
125-
std::vector<mtmd::bitmap> frames;
126-
if (!load_frames_from_dir(ctx, dir_path, frames, opts)) {
127-
return 0;
141+
if (loaded >= opts.max_frames) break;
128142
}
129-
size_t before = dst.entries.size();
130-
for (auto & f : frames) dst.entries.push_back(std::move(f));
131-
return dst.entries.size() - before;
143+
144+
return out_frames;
132145
}
133146

134147
// --- FFmpeg-based file decoding (optional) ---
@@ -244,49 +257,51 @@ static bool decode_video_ffmpeg_to_rgba(const std::string & file,
244257
return taken > 0;
245258
}
246259

247-
bool load_frames_from_file(mtmd_context * ctx,
260+
static mtmd_bitmap* load_frames_from_file(mtmd_context * ctx,
248261
const std::string & file_path,
249-
std::vector<mtmd::bitmap> & out_frames,
250262
const LoadVideoOptions & opts) {
251-
if (!ctx) return false;
263+
if (!ctx) return nullptr;
252264
std::vector<DecodedFrameRGBA> decoded;
253-
if (!decode_video_ffmpeg_to_rgba(file_path, decoded, opts.max_frames, std::max(1, opts.stride))) {
254-
return false;
265+
if (!decode_video_ffmpeg_to_rgba(file_path, decoded, opts.max_frames, std::max(1u, opts.stride))) {
266+
return nullptr;
255267
}
268+
const size_t nframes = decoded.size();
269+
if(nframes < 1){
270+
return nullptr;
271+
}
272+
const int w = decoded[0].width;
273+
const int h = decoded[0].height;
274+
mtmd_bitmap* out_frames = mtmd_bitmap_init_from_video(uint32_t(w), uint32_t(h), uint32_t(nframes), nullptr);
275+
unsigned char * dst = mtmd_bitmap_get_data_mutable(out_frames);
276+
256277
for (auto & fr : decoded) {
257-
const int w = fr.width;
258-
const int h = fr.height;
259-
std::vector<unsigned char> rgb;
260-
rgb.resize((size_t)w * h * 3);
278+
GGML_ASSERT(w == fr.width && h == fr.height);
261279
const unsigned char * src = fr.rgba.data();
262-
unsigned char * dst = rgb.data();
263280
for (int i = 0; i < w * h; ++i) {
264281
dst[0] = src[0]; // R
265282
dst[1] = src[1]; // G
266283
dst[2] = src[2]; // B
267284
dst += 3;
268285
src += 4; // skip A
269286
}
270-
mtmd::bitmap bmp(mtmd_bitmap_init((uint32_t)w, (uint32_t)h, rgb.data()));
271-
if (bmp.ptr) out_frames.push_back(std::move(bmp));
272287
}
273-
return !out_frames.empty();
288+
289+
return out_frames;
274290
}
275291
#else
276-
bool load_frames_from_file(mtmd_context * /*ctx*/,
292+
static mtmd_bitmap* load_frames_from_file(mtmd_context * /*ctx*/,
277293
const std::string & /*file_path*/,
278-
std::vector<mtmd::bitmap> & /*out_frames*/,
279294
const LoadVideoOptions & /*opts*/) {
280-
return false;
295+
return nullptr;
281296
}
282297
bool get_video_info_ffmpeg(const std::string &file, VideoInfo &info) {
298+
LOG_ERR("FFmpeg support is not enabled in this build\n");
283299
return false;
284300
}
285301
#endif
286302

287-
size_t append_frames_from_path(mtmd_context * ctx,
288-
const std::string & path,
289-
mtmd::bitmaps & dst) {
303+
mtmd_bitmap* init_video_bitmap_from_path(mtmd_context * ctx,
304+
const std::string & path) {
290305
mtmd_video::LoadVideoOptions opts;
291306
opts.max_frames = 32;
292307
opts.stride = 1;
@@ -300,7 +315,7 @@ size_t append_frames_from_path(mtmd_context * ctx,
300315
info.total_frames = files.size();
301316
} else {
302317
if(!mtmd_video::get_video_info_ffmpeg(path, info)) {
303-
return 0;
318+
return nullptr;
304319
}
305320
}
306321

@@ -316,14 +331,10 @@ size_t append_frames_from_path(mtmd_context * ctx,
316331
}
317332

318333
if (is_dir(path)) {
319-
return append_frames_from_dir(ctx, path, dst, opts);
320-
} else {
321-
std::vector<mtmd::bitmap> frames;
322-
if (!load_frames_from_file(ctx, path, frames, opts)) return 0;
323-
size_t before = dst.entries.size();
324-
for (auto & f : frames) dst.entries.push_back(std::move(f));
325-
return dst.entries.size() - before;
334+
return load_frames_from_dir(ctx, path, opts);
326335
}
336+
337+
return load_frames_from_file(ctx, path, opts);
327338
}
328339

329340
} // namespace mtmd_video

tools/mtmd/mtmd-video.h

Lines changed: 9 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,16 @@
77

88
#include "mtmd.h"
99

10+
#include <cstddef>
11+
#include <cstdint>
1012
#include <string>
1113
#include <vector>
1214

1315
namespace mtmd_video {
1416

1517
struct LoadVideoOptions {
16-
int max_frames = 32; // maximum frames to load (<= 0 means no limit)
17-
int stride = 1; // take every N-th frame
18+
uint32_t max_frames = 32; // maximum frames to load (<= 0 means no limit)
19+
uint32_t stride = 1; // take every N-th frame
1820
bool recursive = false;// scan subdirectories
1921
};
2022

@@ -23,36 +25,17 @@ struct VideoInfo {
2325
int64_t total_frames = 0; // total number of frames
2426
};
2527

26-
// Load frames from a directory of images (jpg/png/bmp/webp etc.)
27-
// Returns true on success (>=1 frame loaded), false otherwise.
28-
bool load_frames_from_dir(mtmd_context * ctx,
29-
const std::string & dir_path,
30-
std::vector<mtmd::bitmap> & out_frames,
31-
const LoadVideoOptions & opts = {});
32-
33-
// Append frames loaded from a directory into mtmd::bitmaps container.
34-
// Returns number of frames appended (0 on failure).
35-
size_t append_frames_from_dir(mtmd_context * ctx,
36-
const std::string & dir_path,
37-
mtmd::bitmaps & dst,
38-
const LoadVideoOptions & opts = {});
28+
// Check if a path is a video file based on its extension
29+
bool is_video_file(const std::string & path);
3930

4031
// get video info (fps, total frames) via FFmpeg (mp4/mov/mkv/avi/webm...).
4132
// Returns true on success, false otherwise.
4233
bool get_video_info_ffmpeg(const std::string &file, VideoInfo &info);
4334

44-
// Load frames from a video file via FFmpeg (mp4/mov/mkv/avi/webm...).
45-
// Returns true on success (>=1 frame loaded), false otherwise.
46-
bool load_frames_from_file(mtmd_context * ctx,
47-
const std::string & file_path,
48-
std::vector<mtmd::bitmap> & out_frames,
49-
const LoadVideoOptions & opts = {});
50-
5135
// Load frames from a video file or a directory of images (auto-detected).
52-
// Returns number of frames appended (0 on failure or unsupported input).
53-
size_t append_frames_from_path(mtmd_context * ctx,
54-
const std::string & path,
55-
mtmd::bitmaps & dst);
36+
// Returns a mtmd_bitmap containing all frames in RGB format.
37+
mtmd_bitmap* init_video_bitmap_from_path(mtmd_context * ctx,
38+
const std::string & path);
5639

5740
} // namespace mtmd_video
5841

0 commit comments

Comments
 (0)