Skip to content

Commit 6e8c9f6

Browse files
committed
refactor: get_video_info and decode_video use the same AVFormatContext
1 parent 5ee744f commit 6e8c9f6

File tree

2 files changed

+89
-154
lines changed

2 files changed

+89
-154
lines changed

tools/mtmd/mtmd-video.cpp

Lines changed: 89 additions & 149 deletions
Original file line numberDiff line numberDiff line change
@@ -25,12 +25,11 @@ bool is_video_file(const std::string & path){
2525
lower.rfind(".webm") != std::string::npos;
2626
}
2727

28-
static bool get_video_info_from_dir(const std::string &path, VideoInfo &info){
28+
static void get_video_info_from_dir(const std::string &path, VideoInfo &info){
2929
info.fps = 1; // do not care
3030
std::vector<std::string> files;
3131
mtmd_helper::list_files(path, files, true); // recursive
3232
info.total_frames = files.size();
33-
return true;
3433
}
3534
// untested
3635
static mtmd_bitmap* load_frames_from_dir(mtmd_context * ctx,
@@ -80,6 +79,32 @@ struct DecodedFrameRGBA {
8079
std::vector<unsigned char> rgba; // size = width * height * 4
8180
};
8281

82+
static mtmd_video::LoadVideoOptions get_video_sample_options(mtmd_video::VideoInfo info){
83+
mtmd_video::LoadVideoOptions opts;
84+
opts.max_frames = 32;
85+
opts.stride = 1;
86+
opts.recursive = false;
87+
88+
/* MiniCPM-V normal-speed video frames sample method */
89+
90+
#ifdef MTMD_MAX_VIDEO_FRAMES_SMALL
91+
// set a small number of frames for fast test locally
92+
const int32_t minicpmv_max_video_frames = 4;
93+
#else
94+
const int32_t minicpmv_max_video_frames = 64;
95+
#endif
96+
opts.max_frames = minicpmv_max_video_frames;
97+
if(info.total_frames > minicpmv_max_video_frames) {
98+
// uniform sample
99+
opts.stride = (int)std::ceil((double)info.total_frames / minicpmv_max_video_frames);
100+
} else {
101+
// 1 frame per second
102+
opts.stride = (info.fps > 1.0) ? (int)std::ceil(info.fps) : 1;
103+
}
104+
105+
return opts;
106+
}
107+
83108
// --- FFmpeg-based file decoding (optional) ---
84109

85110
#ifdef MTMD_WITH_FFMPEG
@@ -165,11 +190,10 @@ static int64_t seek_packet(void* opaque, int64_t offset, int whence) {
165190

166191
static bool create_format_context_from_buffer(const uint8_t* buffer, size_t size,
167192
AVFormatContext*& fmt,
168-
AVIOContext*& avio_ctx,
169-
uint8_t*& avio_ctx_buffer) {
193+
AVIOContext*& avio_ctx) {
170194
fmt = nullptr;
171195
avio_ctx = nullptr;
172-
avio_ctx_buffer = nullptr;
196+
uint8_t* avio_ctx_buffer = nullptr;
173197

174198
if (!buffer || size == 0) return false;
175199

@@ -312,39 +336,13 @@ static bool get_video_info_from_format_ctx(AVFormatContext *fmt, VideoInfo &info
312336
return true;
313337
}
314338

315-
// from buffer
316-
bool get_video_info(const uint8_t* buffer, size_t size, VideoInfo &info) {
317-
AVFormatContext* fmt = nullptr;
318-
AVIOContext* avio_ctx = nullptr;
319-
uint8_t* avio_ctx_buffer = nullptr;
320-
321-
GGML_ASSERT(create_format_context_from_buffer(buffer, size, fmt, avio_ctx, avio_ctx_buffer));
322-
bool ok = get_video_info_from_format_ctx(fmt, info);
323-
free_format_context_from_buffer(fmt, avio_ctx);
324-
return ok;
325-
}
326-
327-
// from file
328-
bool get_video_info(const std::string &path, VideoInfo &info) {
329-
if(mtmd_helper::is_dir(path)) return get_video_info_from_dir(path, info);
330-
331-
AVFormatContext* fmt = nullptr;
332-
if (avformat_open_input(&fmt, path.c_str(), nullptr, nullptr) < 0)
333-
return false;
334-
335-
std::unique_ptr<AVFormatContext, void(*)(AVFormatContext*)> fmt_guard(fmt, [](AVFormatContext* f){
336-
if (f) avformat_close_input(&f);
337-
});
338-
339-
return get_video_info_from_format_ctx(fmt, info);
340-
}
341-
342339
static bool decode_video_ffmpeg_to_rgba_from_format_ctx(
343340
AVFormatContext* fmt,
344341
std::vector<DecodedFrameRGBA>& frames,
345-
int max_frames,
346-
int stride)
342+
mtmd_video::LoadVideoOptions opts)
347343
{
344+
const auto stride = opts.stride;
345+
const auto max_frames = opts.max_frames;
348346
if(!fmt || stride <= 0 || max_frames <= 0) return false;
349347
if (avformat_find_stream_info(fmt, nullptr) < 0) return false;
350348
int vstream = av_find_best_stream(fmt, AVMEDIA_TYPE_VIDEO, -1, -1, nullptr, 0);
@@ -364,8 +362,8 @@ static bool decode_video_ffmpeg_to_rgba_from_format_ctx(
364362
std::unique_ptr<AVPacket, void(*)(AVPacket*)> pkt_guard(pkt, [](AVPacket *p){ if (p) av_packet_free(&p); });
365363

366364
SwsContext * sws = nullptr;
367-
int idx = 0;
368-
int taken = 0;
365+
uint32_t idx = 0;
366+
uint32_t taken = 0;
369367
while (av_read_frame(fmt, pkt) >= 0) {
370368
if (pkt->stream_index != vstream) { av_packet_unref(pkt); continue; }
371369
if (avcodec_send_packet(ctx, pkt) < 0) { av_packet_unref(pkt); break; }
@@ -394,77 +392,6 @@ static bool decode_video_ffmpeg_to_rgba_from_format_ctx(
394392
return taken > 0;
395393
}
396394

397-
// from file
398-
static bool decode_video_ffmpeg_to_rgba(
399-
const std::string& file,
400-
std::vector<DecodedFrameRGBA>& frames,
401-
int max_frames,
402-
int stride)
403-
{
404-
AVFormatContext* fmt = nullptr;
405-
if (avformat_open_input(&fmt, file.c_str(), nullptr, nullptr) < 0)
406-
return false;
407-
408-
std::unique_ptr<AVFormatContext, void(*)(AVFormatContext*)> fmt_guard(fmt, [](AVFormatContext* f){
409-
if (f) avformat_close_input(&f);
410-
});
411-
412-
return decode_video_ffmpeg_to_rgba_from_format_ctx(fmt, frames, max_frames, stride);
413-
}
414-
415-
// from buffer
416-
static bool decode_video_ffmpeg_to_rgba(
417-
const uint8_t* buffer,
418-
size_t size,
419-
std::vector<DecodedFrameRGBA>& frames,
420-
int max_frames,
421-
int stride)
422-
{
423-
if (!buffer || size == 0) return false;
424-
AVFormatContext* fmt = nullptr;
425-
AVIOContext* avio_ctx = nullptr;
426-
uint8_t* avio_ctx_buffer = nullptr;
427-
428-
GGML_ASSERT(create_format_context_from_buffer(buffer, size, fmt, avio_ctx, avio_ctx_buffer));
429-
430-
bool ok = decode_video_ffmpeg_to_rgba_from_format_ctx(fmt, frames, max_frames, stride);
431-
432-
free_format_context_from_buffer(fmt, avio_ctx);
433-
return ok;
434-
}
435-
#else
436-
bool get_video_info(const std::string &path, VideoInfo &info){
437-
if(mtmd_helper::is_dir(path)) return get_video_info_from_dir(path, info);
438-
LOG_ERR("FFmpeg support is not enabled in this build\n");
439-
return false;
440-
}
441-
bool get_video_info(const uint8_t* /*buffer*/, size_t /*size*/, VideoInfo &/*info*/){
442-
LOG_ERR("FFmpeg support is not enabled in this build\n");
443-
return false;
444-
}
445-
bool is_video_buffer(const uint8_t */*data*/, size_t /*size*/){
446-
LOG_ERR("FFmpeg support is not enabled in this build\n");
447-
return false;
448-
}
449-
static bool decode_video_ffmpeg_to_rgba(
450-
const std::string& /*file*/,
451-
std::vector<DecodedFrameRGBA>& /*frames*/,
452-
int /*max_frames*/,
453-
int /*stride*/)
454-
{
455-
return false;
456-
}
457-
static bool decode_video_ffmpeg_to_rgba(
458-
const uint8_t* /*buffer*/,
459-
size_t /*size*/,
460-
std::vector<DecodedFrameRGBA>& /*frames*/,
461-
int /*max_frames*/,
462-
int /*stride*/)
463-
{
464-
return false;
465-
}
466-
#endif
467-
468395
static mtmd_bitmap* convert_frames_to_bitmap(mtmd_context * ctx, const std::vector<DecodedFrameRGBA>& decoded) {
469396
if (!ctx) return nullptr;
470397
if(decoded.empty()) return nullptr;
@@ -492,70 +419,83 @@ static mtmd_bitmap* convert_frames_to_bitmap(mtmd_context * ctx, const std::vect
492419
return out_frames;
493420
}
494421

495-
static mtmd_video::LoadVideoOptions get_video_sample_options(mtmd_video::VideoInfo info){
496-
mtmd_video::LoadVideoOptions opts;
497-
opts.max_frames = 32;
498-
opts.stride = 1;
499-
opts.recursive = false;
500-
501-
/* MiniCPM-V normal-speed video frames sample method */
502-
503-
#ifdef MTMD_MAX_VIDEO_FRAMES_SMALL
504-
// set a small number of frames for fast test locally
505-
const int32_t minicpmv_max_video_frames = 4;
506-
#else
507-
const int32_t minicpmv_max_video_frames = 64;
508-
#endif
509-
opts.max_frames = minicpmv_max_video_frames;
510-
if(info.total_frames > minicpmv_max_video_frames) {
511-
// uniform sample
512-
opts.stride = (int)std::ceil((double)info.total_frames / minicpmv_max_video_frames);
513-
} else {
514-
// 1 frame per second
515-
opts.stride = (info.fps > 1.0) ? (int)std::ceil(info.fps) : 1;
516-
}
517-
518-
return opts;
519-
}
520-
521-
mtmd_bitmap* init_video_bitmap(mtmd_context * ctx, const std::string & path) {
422+
mtmd_bitmap* init_video_bitmap(mtmd_context * ctx, const uint8_t* buffer, size_t size){
522423
auto info = mtmd_video::VideoInfo{};
523-
if(!mtmd_video::get_video_info(path, info)) {
524-
LOG_ERR("Unable to get video info from path: %s\n", path.c_str());
424+
AVFormatContext* fmt = nullptr;
425+
AVIOContext* avio_ctx = nullptr;
426+
GGML_ASSERT(create_format_context_from_buffer(buffer, size, fmt, avio_ctx));
427+
428+
if(!get_video_info_from_format_ctx(fmt, info)) {
429+
LOG_ERR("Unable to get video info from buffer\n");
430+
free_format_context_from_buffer(fmt, avio_ctx);
525431
return nullptr;
526432
}
527433

528434
const auto opts = get_video_sample_options(info);
529435

530-
if (mtmd_helper::is_dir(path)) {
531-
return load_frames_from_dir(ctx, path, opts);
532-
}
533-
534436
std::vector<DecodedFrameRGBA> frames;
535-
if(!decode_video_ffmpeg_to_rgba(path, frames, opts.max_frames, std::max(1u, opts.stride))){
536-
LOG_ERR("Unable to decode video from path: %s\n", path.c_str());
437+
if(!decode_video_ffmpeg_to_rgba_from_format_ctx(fmt, frames, opts)){
438+
LOG_ERR("Unable to decode video from buffer\n");
439+
free_format_context_from_buffer(fmt, avio_ctx);
537440
return nullptr;
538441
}
539442

540-
return convert_frames_to_bitmap(ctx, frames);
443+
auto * res = convert_frames_to_bitmap(ctx, frames);
444+
free_format_context_from_buffer(fmt, avio_ctx);
445+
return res;
446+
}
447+
#else
448+
bool is_video_buffer(const uint8_t */*data*/, size_t /*size*/){
449+
LOG_WRN("FFmpeg support is not enabled in this build, can not check it\n");
450+
return false;
451+
}
452+
mtmd_bitmap* init_video_bitmap(mtmd_context * /*ctx*/, const uint8_t* /*buffer*/, size_t /*size*/){
453+
LOG_ERR("FFmpeg support is not enabled in this build, can not load video from buffer\n");
454+
return nullptr;
541455
}
456+
#endif
542457

543-
mtmd_bitmap* init_video_bitmap(mtmd_context * ctx, const uint8_t* buffer, size_t size){
458+
mtmd_bitmap* init_video_bitmap(mtmd_context * ctx, const std::string & path) {
544459
auto info = mtmd_video::VideoInfo{};
545-
if(!mtmd_video::get_video_info(buffer, size, info)) {
546-
LOG_ERR("Unable to get video info from buffer\n");
460+
461+
if(mtmd_helper::is_dir(path)){
462+
get_video_info_from_dir(path, info);
463+
const auto opts = get_video_sample_options(info);
464+
return load_frames_from_dir(ctx, path, opts);
465+
}
466+
467+
// handle file otherwise
468+
469+
#ifdef MTMD_WITH_FFMPEG
470+
AVFormatContext* fmt = nullptr;
471+
472+
if (avformat_open_input(&fmt, path.c_str(), nullptr, nullptr) < 0){
473+
LOG_ERR("Unable to open video from path: %s\n", path.c_str());
474+
if(fmt) avformat_close_input(&fmt);
475+
return nullptr;
476+
}
477+
if(!get_video_info_from_format_ctx(fmt, info)) {
478+
LOG_ERR("Unable to get video info from path: %s\n", path.c_str());
479+
if(fmt) avformat_close_input(&fmt);
547480
return nullptr;
548481
}
549482

550483
const auto opts = get_video_sample_options(info);
551484

552485
std::vector<DecodedFrameRGBA> frames;
553-
if(!decode_video_ffmpeg_to_rgba(buffer, size, frames, opts.max_frames, std::max(1u, opts.stride))){
554-
LOG_ERR("Unable to decode video from buffer\n");
486+
if(!decode_video_ffmpeg_to_rgba_from_format_ctx(fmt, frames, opts)){
487+
LOG_ERR("Unable to decode video from path: %s\n", path.c_str());
488+
if(fmt) avformat_close_input(&fmt);
555489
return nullptr;
556490
}
557491

558-
return convert_frames_to_bitmap(ctx, frames);
492+
auto * res = convert_frames_to_bitmap(ctx, frames);
493+
if(fmt) avformat_close_input(&fmt);
494+
return res;
495+
#else
496+
LOG_ERR("FFmpeg support is not enabled in this build, can not load video from file\n");
497+
return nullptr;
498+
#endif
559499
}
560500

561501
} // namespace mtmd_video

tools/mtmd/mtmd-video.h

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -32,11 +32,6 @@ bool is_video_file(const std::string & path);
3232
// Notice: audio containers may also be recognized as valid media
3333
bool is_video_buffer(const uint8_t *data, size_t size);
3434

35-
// get video info (fps, total frames) via FFmpeg (mp4/mov/mkv/avi/webm...).
36-
// Returns true on success, false otherwise.
37-
bool get_video_info(const std::string &path, VideoInfo &info);
38-
bool get_video_info(const uint8_t* buffer, size_t size, VideoInfo &info);
39-
4035
// Append frames loaded from a file or directory (auto-detect).
4136
// Returns a mtmd_bitmap containing all frames in RGB format.
4237
mtmd_bitmap* init_video_bitmap(mtmd_context * ctx,

0 commit comments

Comments
 (0)