@@ -25,12 +25,11 @@ bool is_video_file(const std::string & path){
2525 lower.rfind (" .webm" ) != std::string::npos;
2626}
2727
28- static bool get_video_info_from_dir (const std::string &path, VideoInfo &info){
28+ static void get_video_info_from_dir (const std::string &path, VideoInfo &info){
2929 info.fps = 1 ; // do not care
3030 std::vector<std::string> files;
3131 mtmd_helper::list_files (path, files, true ); // recursive
3232 info.total_frames = files.size ();
33- return true ;
3433}
3534// untested
3635static mtmd_bitmap* load_frames_from_dir (mtmd_context * ctx,
@@ -80,6 +79,32 @@ struct DecodedFrameRGBA {
8079 std::vector<unsigned char > rgba; // size = width * height * 4
8180};
8281
82+ static mtmd_video::LoadVideoOptions get_video_sample_options (mtmd_video::VideoInfo info){
83+ mtmd_video::LoadVideoOptions opts;
84+ opts.max_frames = 32 ;
85+ opts.stride = 1 ;
86+ opts.recursive = false ;
87+
88+ /* MiniCPM-V normal-speed video frames sample method */
89+
90+ #ifdef MTMD_MAX_VIDEO_FRAMES_SMALL
91+ // set a small number of frames for fast test locally
92+ const int32_t minicpmv_max_video_frames = 4 ;
93+ #else
94+ const int32_t minicpmv_max_video_frames = 64 ;
95+ #endif
96+ opts.max_frames = minicpmv_max_video_frames;
97+ if (info.total_frames > minicpmv_max_video_frames) {
98+ // uniform sample
99+ opts.stride = (int )std::ceil ((double )info.total_frames / minicpmv_max_video_frames);
100+ } else {
101+ // 1 frame per second
102+ opts.stride = (info.fps > 1.0 ) ? (int )std::ceil (info.fps ) : 1 ;
103+ }
104+
105+ return opts;
106+ }
107+
83108// --- FFmpeg-based file decoding (optional) ---
84109
85110#ifdef MTMD_WITH_FFMPEG
@@ -165,11 +190,10 @@ static int64_t seek_packet(void* opaque, int64_t offset, int whence) {
165190
166191static bool create_format_context_from_buffer (const uint8_t * buffer, size_t size,
167192 AVFormatContext*& fmt,
168- AVIOContext*& avio_ctx,
169- uint8_t *& avio_ctx_buffer) {
193+ AVIOContext*& avio_ctx) {
170194 fmt = nullptr ;
171195 avio_ctx = nullptr ;
172- avio_ctx_buffer = nullptr ;
196+ uint8_t * avio_ctx_buffer = nullptr ;
173197
174198 if (!buffer || size == 0 ) return false ;
175199
@@ -312,39 +336,13 @@ static bool get_video_info_from_format_ctx(AVFormatContext *fmt, VideoInfo &info
312336 return true ;
313337}
314338
315- // from buffer
316- bool get_video_info (const uint8_t * buffer, size_t size, VideoInfo &info) {
317- AVFormatContext* fmt = nullptr ;
318- AVIOContext* avio_ctx = nullptr ;
319- uint8_t * avio_ctx_buffer = nullptr ;
320-
321- GGML_ASSERT (create_format_context_from_buffer (buffer, size, fmt, avio_ctx, avio_ctx_buffer));
322- bool ok = get_video_info_from_format_ctx (fmt, info);
323- free_format_context_from_buffer (fmt, avio_ctx);
324- return ok;
325- }
326-
327- // from file
328- bool get_video_info (const std::string &path, VideoInfo &info) {
329- if (mtmd_helper::is_dir (path)) return get_video_info_from_dir (path, info);
330-
331- AVFormatContext* fmt = nullptr ;
332- if (avformat_open_input (&fmt, path.c_str (), nullptr , nullptr ) < 0 )
333- return false ;
334-
335- std::unique_ptr<AVFormatContext, void (*)(AVFormatContext*)> fmt_guard (fmt, [](AVFormatContext* f){
336- if (f) avformat_close_input (&f);
337- });
338-
339- return get_video_info_from_format_ctx (fmt, info);
340- }
341-
342339static bool decode_video_ffmpeg_to_rgba_from_format_ctx (
343340 AVFormatContext* fmt,
344341 std::vector<DecodedFrameRGBA>& frames,
345- int max_frames,
346- int stride)
342+ mtmd_video::LoadVideoOptions opts)
347343{
344+ const auto stride = opts.stride ;
345+ const auto max_frames = opts.max_frames ;
348346 if (!fmt || stride <= 0 || max_frames <= 0 ) return false ;
349347 if (avformat_find_stream_info (fmt, nullptr ) < 0 ) return false ;
350348 int vstream = av_find_best_stream (fmt, AVMEDIA_TYPE_VIDEO, -1 , -1 , nullptr , 0 );
@@ -364,8 +362,8 @@ static bool decode_video_ffmpeg_to_rgba_from_format_ctx(
364362 std::unique_ptr<AVPacket, void (*)(AVPacket*)> pkt_guard (pkt, [](AVPacket *p){ if (p) av_packet_free (&p); });
365363
366364 SwsContext * sws = nullptr ;
367- int idx = 0 ;
368- int taken = 0 ;
365+ uint32_t idx = 0 ;
366+ uint32_t taken = 0 ;
369367 while (av_read_frame (fmt, pkt) >= 0 ) {
370368 if (pkt->stream_index != vstream) { av_packet_unref (pkt); continue ; }
371369 if (avcodec_send_packet (ctx, pkt) < 0 ) { av_packet_unref (pkt); break ; }
@@ -394,77 +392,6 @@ static bool decode_video_ffmpeg_to_rgba_from_format_ctx(
394392 return taken > 0 ;
395393}
396394
397- // from file
398- static bool decode_video_ffmpeg_to_rgba (
399- const std::string& file,
400- std::vector<DecodedFrameRGBA>& frames,
401- int max_frames,
402- int stride)
403- {
404- AVFormatContext* fmt = nullptr ;
405- if (avformat_open_input (&fmt, file.c_str (), nullptr , nullptr ) < 0 )
406- return false ;
407-
408- std::unique_ptr<AVFormatContext, void (*)(AVFormatContext*)> fmt_guard (fmt, [](AVFormatContext* f){
409- if (f) avformat_close_input (&f);
410- });
411-
412- return decode_video_ffmpeg_to_rgba_from_format_ctx (fmt, frames, max_frames, stride);
413- }
414-
415- // from buffer
416- static bool decode_video_ffmpeg_to_rgba (
417- const uint8_t * buffer,
418- size_t size,
419- std::vector<DecodedFrameRGBA>& frames,
420- int max_frames,
421- int stride)
422- {
423- if (!buffer || size == 0 ) return false ;
424- AVFormatContext* fmt = nullptr ;
425- AVIOContext* avio_ctx = nullptr ;
426- uint8_t * avio_ctx_buffer = nullptr ;
427-
428- GGML_ASSERT (create_format_context_from_buffer (buffer, size, fmt, avio_ctx, avio_ctx_buffer));
429-
430- bool ok = decode_video_ffmpeg_to_rgba_from_format_ctx (fmt, frames, max_frames, stride);
431-
432- free_format_context_from_buffer (fmt, avio_ctx);
433- return ok;
434- }
435- #else
436- bool get_video_info (const std::string &path, VideoInfo &info){
437- if (mtmd_helper::is_dir (path)) return get_video_info_from_dir (path, info);
438- LOG_ERR (" FFmpeg support is not enabled in this build\n " );
439- return false ;
440- }
441- bool get_video_info (const uint8_t * /* buffer*/ , size_t /* size*/ , VideoInfo &/* info*/ ){
442- LOG_ERR (" FFmpeg support is not enabled in this build\n " );
443- return false ;
444- }
445- bool is_video_buffer (const uint8_t */*data*/, size_t /*size*/){
446- LOG_ERR (" FFmpeg support is not enabled in this build\n " );
447- return false ;
448- }
449- static bool decode_video_ffmpeg_to_rgba (
450- const std::string& /* file*/ ,
451- std::vector<DecodedFrameRGBA>& /* frames*/ ,
452- int /* max_frames*/ ,
453- int /* stride*/ )
454- {
455- return false ;
456- }
457- static bool decode_video_ffmpeg_to_rgba (
458- const uint8_t * /* buffer*/ ,
459- size_t /* size*/ ,
460- std::vector<DecodedFrameRGBA>& /* frames*/ ,
461- int /* max_frames*/ ,
462- int /* stride*/ )
463- {
464- return false ;
465- }
466- #endif
467-
468395static mtmd_bitmap* convert_frames_to_bitmap (mtmd_context * ctx, const std::vector<DecodedFrameRGBA>& decoded) {
469396 if (!ctx) return nullptr ;
470397 if (decoded.empty ()) return nullptr ;
@@ -492,70 +419,83 @@ static mtmd_bitmap* convert_frames_to_bitmap(mtmd_context * ctx, const std::vect
492419 return out_frames;
493420}
494421
495- static mtmd_video::LoadVideoOptions get_video_sample_options (mtmd_video::VideoInfo info){
496- mtmd_video::LoadVideoOptions opts;
497- opts.max_frames = 32 ;
498- opts.stride = 1 ;
499- opts.recursive = false ;
500-
501- /* MiniCPM-V normal-speed video frames sample method */
502-
503- #ifdef MTMD_MAX_VIDEO_FRAMES_SMALL
504- // set a small number of frames for fast test locally
505- const int32_t minicpmv_max_video_frames = 4 ;
506- #else
507- const int32_t minicpmv_max_video_frames = 64 ;
508- #endif
509- opts.max_frames = minicpmv_max_video_frames;
510- if (info.total_frames > minicpmv_max_video_frames) {
511- // uniform sample
512- opts.stride = (int )std::ceil ((double )info.total_frames / minicpmv_max_video_frames);
513- } else {
514- // 1 frame per second
515- opts.stride = (info.fps > 1.0 ) ? (int )std::ceil (info.fps ) : 1 ;
516- }
517-
518- return opts;
519- }
520-
521- mtmd_bitmap* init_video_bitmap (mtmd_context * ctx, const std::string & path) {
422+ mtmd_bitmap* init_video_bitmap (mtmd_context * ctx, const uint8_t * buffer, size_t size){
522423 auto info = mtmd_video::VideoInfo{};
523- if (!mtmd_video::get_video_info (path, info)) {
524- LOG_ERR (" Unable to get video info from path: %s\n " , path.c_str ());
424+ AVFormatContext* fmt = nullptr ;
425+ AVIOContext* avio_ctx = nullptr ;
426+ GGML_ASSERT (create_format_context_from_buffer (buffer, size, fmt, avio_ctx));
427+
428+ if (!get_video_info_from_format_ctx (fmt, info)) {
429+ LOG_ERR (" Unable to get video info from buffer\n " );
430+ free_format_context_from_buffer (fmt, avio_ctx);
525431 return nullptr ;
526432 }
527433
528434 const auto opts = get_video_sample_options (info);
529435
530- if (mtmd_helper::is_dir (path)) {
531- return load_frames_from_dir (ctx, path, opts);
532- }
533-
534436 std::vector<DecodedFrameRGBA> frames;
535- if (!decode_video_ffmpeg_to_rgba (path, frames, opts.max_frames , std::max (1u , opts.stride ))){
536- LOG_ERR (" Unable to decode video from path: %s\n " , path.c_str ());
437+ if (!decode_video_ffmpeg_to_rgba_from_format_ctx (fmt, frames, opts)){
438+ LOG_ERR (" Unable to decode video from buffer\n " );
439+ free_format_context_from_buffer (fmt, avio_ctx);
537440 return nullptr ;
538441 }
539442
540- return convert_frames_to_bitmap (ctx, frames);
443+ auto * res = convert_frames_to_bitmap (ctx, frames);
444+ free_format_context_from_buffer (fmt, avio_ctx);
445+ return res;
446+ }
447+ #else
448+ bool is_video_buffer (const uint8_t */*data*/, size_t /*size*/){
449+ LOG_WRN (" FFmpeg support is not enabled in this build, can not check it\n " );
450+ return false ;
451+ }
452+ mtmd_bitmap* init_video_bitmap (mtmd_context * /* ctx*/ , const uint8_t * /* buffer*/ , size_t /* size*/ ){
453+ LOG_ERR (" FFmpeg support is not enabled in this build, can not load video from buffer\n " );
454+ return nullptr ;
541455}
456+ #endif
542457
543- mtmd_bitmap* init_video_bitmap (mtmd_context * ctx, const uint8_t * buffer, size_t size) {
458+ mtmd_bitmap* init_video_bitmap (mtmd_context * ctx, const std::string & path) {
544459 auto info = mtmd_video::VideoInfo{};
545- if (!mtmd_video::get_video_info (buffer, size, info)) {
546- LOG_ERR (" Unable to get video info from buffer\n " );
460+
461+ if (mtmd_helper::is_dir (path)){
462+ get_video_info_from_dir (path, info);
463+ const auto opts = get_video_sample_options (info);
464+ return load_frames_from_dir (ctx, path, opts);
465+ }
466+
467+ // handle file otherwise
468+
469+ #ifdef MTMD_WITH_FFMPEG
470+ AVFormatContext* fmt = nullptr ;
471+
472+ if (avformat_open_input (&fmt, path.c_str (), nullptr , nullptr ) < 0 ){
473+ LOG_ERR (" Unable to open video from path: %s\n " , path.c_str ());
474+ if (fmt) avformat_close_input (&fmt);
475+ return nullptr ;
476+ }
477+ if (!get_video_info_from_format_ctx (fmt, info)) {
478+ LOG_ERR (" Unable to get video info from path: %s\n " , path.c_str ());
479+ if (fmt) avformat_close_input (&fmt);
547480 return nullptr ;
548481 }
549482
550483 const auto opts = get_video_sample_options (info);
551484
552485 std::vector<DecodedFrameRGBA> frames;
553- if (!decode_video_ffmpeg_to_rgba (buffer, size, frames, opts.max_frames , std::max (1u , opts.stride ))){
554- LOG_ERR (" Unable to decode video from buffer\n " );
486+ if (!decode_video_ffmpeg_to_rgba_from_format_ctx (fmt, frames, opts)){
487+ LOG_ERR (" Unable to decode video from path: %s\n " , path.c_str ());
488+ if (fmt) avformat_close_input (&fmt);
555489 return nullptr ;
556490 }
557491
558- return convert_frames_to_bitmap (ctx, frames);
492+ auto * res = convert_frames_to_bitmap (ctx, frames);
493+ if (fmt) avformat_close_input (&fmt);
494+ return res;
495+ #else
496+ LOG_ERR (" FFmpeg support is not enabled in this build, can not load video from file\n " );
497+ return nullptr ;
498+ #endif
559499}
560500
561501} // namespace mtmd_video
0 commit comments