Skip to content

Commit d5b832d

Browse files
committed
feat: support minicpm-v video normal speed
1 parent ae099ec commit d5b832d

File tree

3 files changed

+88
-28
lines changed

3 files changed

+88
-28
lines changed

tools/mtmd/mtmd-cli.cpp

Lines changed: 9 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -162,21 +162,10 @@ struct mtmd_cli_context {
162162
return true;
163163
}
164164

165-
// Load multiple frames from a directory as a "video" (sequence of images)
165+
// Load multiple frames from a video file or a directory as a "video" (sequence of images)
166166
// Returns number of frames appended
167-
size_t load_video_dir(const std::string & dir, int max_frames = 32, int stride = 1, bool recursive = false) {
168-
mtmd_video::LoadVideoOptions opts;
169-
opts.max_frames = max_frames;
170-
opts.stride = stride;
171-
opts.recursive = recursive;
172-
return mtmd_video::append_frames_from_dir(ctx_vision.get(), dir, bitmaps, opts);
173-
}
174-
175-
size_t load_video_path(const std::string & path, int max_frames = 32, int stride = 1) {
176-
mtmd_video::LoadVideoOptions opts;
177-
opts.max_frames = max_frames;
178-
opts.stride = stride;
179-
return mtmd_video::append_frames_from_path(ctx_vision.get(), path, bitmaps, opts);
167+
// Forwards `path` unchanged to mtmd_video::append_frames_from_path together with this
// context's vision handle and bitmap list; no frame limit or stride is passed from here.
size_t load_video(const std::string & path) {
168+
return mtmd_video::append_frames_from_path(ctx_vision.get(), path, bitmaps); // result is returned to the caller as-is
180169
}
181170
};
182171

@@ -322,7 +311,7 @@ int main(int argc, char ** argv) {
322311
for (const auto & vpath : params.video) {
323312
// for video understanding: disable UHD slicing (overview only)
324313
mtmd_set_minicpmv_max_slice_nums(ctx.ctx_vision.get(), 0);
325-
size_t n = ctx.load_video_path(vpath, /*max_frames*/3, /*stride*/1);
314+
size_t n = ctx.load_video(vpath);
326315
if (n == 0) {
327316
LOG_ERR("Unable to load video frames from %s\n", vpath.c_str());
328317
return 1;
@@ -399,23 +388,19 @@ int main(int argc, char ** argv) {
399388
}
400389
std::string media_path = line.substr(7);
401390
if (is_video) {
402-
// parse optional args: "/video <dir> [max_frames] [stride]"
391+
// parse command: "/video <file or dir path>" (no optional args; only the first space-separated token is used as the path)
403392
// simple split by spaces
404393
std::vector<std::string> parts = string_split(media_path, " ");
405-
std::string dir = parts.size() > 0 ? parts[0] : media_path;
406-
int max_frames = 32;
407-
int stride = 1;
408-
if (parts.size() > 1) max_frames = std::max(1, atoi(parts[1].c_str()));
409-
if (parts.size() > 2) stride = std::max(1, atoi(parts[2].c_str()));
410-
size_t n = ctx.load_video_path(dir, max_frames, stride);
394+
std::string path = parts.size() > 0 ? parts[0] : media_path;
395+
size_t n = ctx.load_video(path);
411396
if (n > 0) {
412-
LOG("%s video loaded with %zu frames\n", dir.c_str(), n);
397+
LOG("%s video loaded with %zu frames\n", path.c_str(), n);
413398
// add one marker per frame to match mtmd_tokenize expectations
414399
for (size_t i = 0; i < n; ++i) {
415400
content += mtmd_default_marker();
416401
}
417402
} else {
418-
LOG_ERR("ERR: failed to load video frames from %s\n", dir.c_str());
403+
LOG_ERR("ERR: failed to load video frames from %s\n", path.c_str());
419404
}
420405
} else {
421406
if (ctx.load_media(media_path)) {

tools/mtmd/mtmd-video.cpp

Lines changed: 69 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
#include <cerrno>
1010
#include <cstring>
1111
#include <memory>
12+
#include <cmath>
1213

1314
#if defined(_WIN32)
1415
#define WIN32_LEAN_AND_MEAN
@@ -148,6 +149,44 @@ struct DecodedFrameRGBA {
148149
std::vector<unsigned char> rgba; // size = width * height * 4
149150
};
150151

152+
// Probe a media file with FFmpeg and fill `info` with the frame rate and frame count
// of its best video stream.
//   file: path handed to avformat_open_input()
//   info: output; a field is only written when the value could be determined,
//         otherwise it keeps whatever the caller put there
// Returns false if the file cannot be opened, stream info cannot be read, or no video
// stream is found. Returns true otherwise, even when fps and/or total_frames were not set.
bool get_video_info_ffmpeg(const std::string &file, VideoInfo &info) {
153+
AVFormatContext *fmt = nullptr;
154+
if (avformat_open_input(&fmt, file.c_str(), nullptr, nullptr) < 0) {
155+
return false;
156+
}
157+
158+
std::unique_ptr<AVFormatContext, void(*)(AVFormatContext*)> fmt_guard(fmt,
159+
[](AVFormatContext *f){ if (f) {avformat_close_input(&f);} }); // closes the input on every return path below
160+
161+
if (avformat_find_stream_info(fmt, nullptr) < 0) {
162+
return false;
163+
}
164+
165+
// find the video stream (av_find_best_stream returns a negative value on failure)
166+
int vstream = av_find_best_stream(fmt, AVMEDIA_TYPE_VIDEO, -1, -1, nullptr, 0);
167+
if (vstream < 0) {
168+
return false;
169+
}
170+
171+
AVStream *st = fmt->streams[vstream];
172+
173+
// get fps: prefer avg_frame_rate, fall back to r_frame_rate; left unchanged if neither is a positive rational
174+
if (st->avg_frame_rate.num > 0 && st->avg_frame_rate.den > 0){
175+
info.fps = av_q2d(st->avg_frame_rate);
176+
}else if (st->r_frame_rate.num > 0 && st->r_frame_rate.den > 0){
177+
info.fps = av_q2d(st->r_frame_rate);
178+
}
179+
// get total frames: use the stream's nb_frames when it is positive
180+
if (st->nb_frames > 0){
181+
info.total_frames = st->nb_frames;
182+
}else if (fmt->duration > 0 && info.fps > 0.0){
183+
// estimate total frames if nb_frames is not available: (duration / AV_TIME_BASE) * fps, rounded to nearest
184+
info.total_frames = std::llround((fmt->duration / (double)AV_TIME_BASE) * info.fps);
185+
}
186+
187+
return true; // success only means a video stream was found, not that both fields were filled
188+
}
189+
151190
static bool decode_video_ffmpeg_to_rgba(const std::string & file,
152191
std::vector<DecodedFrameRGBA> & frames,
153192
int max_frames,
@@ -239,12 +278,40 @@ bool load_frames_from_file(mtmd_context * /*ctx*/,
239278
const LoadVideoOptions & /*opts*/) {
240279
return false;
241280
}
281+
// Stub counterpart of get_video_info_ffmpeg(): performs no probing, never touches the
// output argument, and always reports failure.
// NOTE(review): it sits next to the load_frames_from_file stub just before #endif, so it is
// presumably the variant used when FFmpeg support is not compiled in — confirm against the matching #if.
// Parameter names are commented out, as in that neighbouring stub, so the unused
// arguments do not trigger unused-parameter warnings.
bool get_video_info_ffmpeg(const std::string & /*file*/, VideoInfo & /*info*/) {
282+
return false;
283+
}
242284
#endif
243285

244286
size_t append_frames_from_path(mtmd_context * ctx,
245287
const std::string & path,
246-
mtmd::bitmaps & dst,
247-
const LoadVideoOptions & opts) {
288+
mtmd::bitmaps & dst) {
289+
mtmd_video::LoadVideoOptions opts;
290+
opts.max_frames = 32;
291+
opts.stride = 1;
292+
opts.recursive = false;
293+
294+
auto info = mtmd_video::VideoInfo{};
295+
if(is_dir(path)) {
296+
info.fps = 1;
297+
std::vector<std::string> files;
298+
list_files(path, files, opts.recursive);
299+
info.total_frames = files.size();
300+
} else {
301+
mtmd_video::get_video_info_ffmpeg(path, info);
302+
}
303+
304+
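// MiniCPM-V normal-speed sampling: keep at most 64 frames; longer inputs are sampled uniformly, shorter ones at about 1 frame per second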
305+
const int32_t minicpmv_max_video_frames = 64;
306+
opts.max_frames = minicpmv_max_video_frames;
307+
if(info.total_frames > minicpmv_max_video_frames) {
308+
// uniform sample
309+
opts.stride = (int)std::ceil((double)info.total_frames / minicpmv_max_video_frames);
310+
} else {
311+
// 1 frame per second
312+
opts.stride = (info.fps > 1.0) ? (int)std::ceil(info.fps) : 1;
313+
}
314+
248315
if (is_dir(path)) {
249316
return append_frames_from_dir(ctx, path, dst, opts);
250317
} else {

tools/mtmd/mtmd-video.h

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,11 @@ struct LoadVideoOptions {
1818
bool recursive = false;// scan subdirectories
1919
};
2020

21+
// Basic video metadata, used as the output argument of get_video_info_ffmpeg().
// Both fields are zero-initialised.
// NOTE(review): zero presumably doubles as "unknown" — confirm with callers.
struct VideoInfo {
22+
double fps = 0.0; // frames per second (0.0 until set)
23+
int64_t total_frames = 0; // total number of frames (0 until set)
24+
};
25+
2126
// Load frames from a directory of images (jpg/png/bmp/webp etc.)
2227
// Returns true on success (>=1 frame loaded), false otherwise.
2328
bool load_frames_from_dir(mtmd_context * ctx,
@@ -32,6 +37,10 @@ size_t append_frames_from_dir(mtmd_context * ctx,
3237
mtmd::bitmaps & dst,
3338
const LoadVideoOptions & opts = {});
3439

40+
// Get video info (fps, total frames) via FFmpeg (mp4/mov/mkv/avi/webm...).
41+
// Returns true on success, false otherwise.
42+
bool get_video_info_ffmpeg(const std::string &file, VideoInfo &info);
43+
3544
// Load frames from a video file via FFmpeg (mp4/mov/mkv/avi/webm...).
3645
// Returns true on success (>=1 frame loaded), false otherwise.
3746
bool load_frames_from_file(mtmd_context * ctx,
@@ -43,8 +52,7 @@ bool load_frames_from_file(mtmd_context * ctx,
4352
// Returns number of frames appended (0 on failure or unsupported input).
4453
size_t append_frames_from_path(mtmd_context * ctx,
4554
const std::string & path,
46-
mtmd::bitmaps & dst,
47-
const LoadVideoOptions & opts = {});
55+
mtmd::bitmaps & dst);
4856

4957
} // namespace mtmd_video
5058

0 commit comments

Comments
 (0)