From 6f6da8b9c521c233227e6ec6ded7332b8fb4bd7a Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 2 Nov 2025 16:38:44 +0100 Subject: [PATCH 1/2] initial impl of the video api --- tools/mtmd/mtmd-helper.cpp | 3 ++ tools/mtmd/mtmd-helper.h | 4 ++- tools/mtmd/mtmd.cpp | 69 ++++++++++++++++++++++++++++++++++---- tools/mtmd/mtmd.h | 16 +++++++++ 4 files changed, 84 insertions(+), 8 deletions(-) diff --git a/tools/mtmd/mtmd-helper.cpp b/tools/mtmd/mtmd-helper.cpp index 686f42f3960fe..c10741aca64d9 100644 --- a/tools/mtmd/mtmd-helper.cpp +++ b/tools/mtmd/mtmd-helper.cpp @@ -458,3 +458,6 @@ mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * return mtmd_helper_bitmap_init_from_buf(ctx, buf.data(), buf.size()); } + + +// TODO: implement video support here diff --git a/tools/mtmd/mtmd-helper.h b/tools/mtmd/mtmd-helper.h index 5c0edc6937eee..dce2353d60a7d 100644 --- a/tools/mtmd/mtmd-helper.h +++ b/tools/mtmd/mtmd-helper.h @@ -21,7 +21,8 @@ extern "C" { // // helper function to construct a mtmd_bitmap from a file -// it calls mtmd_helper_bitmap_init_from_buf() internally +// for image and audio, it calls mtmd_helper_bitmap_init_from_buf() internally +// for video, it reads and decodes and streams individual image frames to a bitmap // returns nullptr on failure // this function is thread-safe MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname); @@ -30,6 +31,7 @@ MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, con // supported formats: // image: formats supported by stb_image: jpg, png, bmp, gif, etc. // audio: formats supported by miniaudio: wav, mp3, flac +// video: formats supported by ffmpeg: mp4, mov, avi, mkv, etc. // note: audio files will be auto-detected based on magic bytes // returns nullptr on failure // this function is thread-safe diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index 196641dd95ef4..b511a0fc56ded 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -22,21 +22,29 @@ #include #include +enum mtmd_bitmap_type { + MTMD_BITMAP_TYPE_IMAGE, + MTMD_BITMAP_TYPE_AUDIO, + MTMD_BITMAP_TYPE_VIDEO, +}; + // represents raw image data, layout is RGBRGBRGB... // length of data must be nx * ny * 3 struct mtmd_bitmap { uint32_t nx; uint32_t ny; + uint32_t nz = 1; // number of frames for video, 1 for image/audio std::vector data; std::string id; // optional user-defined id, for ex: can be set to image hash, useful for KV cache tracking - bool is_audio = false; // true if the bitmap is audio + bool type = MTMD_BITMAP_TYPE_IMAGE; }; struct mtmd_image_tokens { uint32_t nx; // number of tokens in x direction uint32_t ny; // number of tokens in y direction + uint32_t nz = 1; // number of tokens in z direction (aka time for video), always 1 for image bool use_mrope_pos = false; // use M-RoPE position counting (the whole image is 1 temporal position) - uint32_t n_tokens() const { return nx * ny; } + uint32_t n_tokens() const { return nx * ny * nz; } clip_image_f32_batch batch_f32; // preprocessed image patches std::string id; // optional user-defined ID, useful for KV cache tracking @@ -44,6 +52,7 @@ struct mtmd_image_tokens { return mtmd_image_tokens{ nx, ny, + nz, use_mrope_pos, batch_f32.clone(), id @@ -492,7 +501,7 @@ struct mtmd_tokenizer { } int32_t add_media(const mtmd_bitmap * bitmap) { - if (!bitmap->is_audio) { + if (bitmap->type == MTMD_BITMAP_TYPE_IMAGE) { // handle image if (!ctx->ctx_v) { @@ -616,7 +625,7 @@ struct mtmd_tokenizer { add_text(ctx->img_end, true); // add image end token } - } else { + } else if (bitmap->type == MTMD_BITMAP_TYPE_AUDIO) { // handle audio if (!ctx->ctx_a) { @@ -676,6 +685,12 @@ struct mtmd_tokenizer { if (!ctx->aud_end.empty()) { add_text(ctx->aud_end, true); // add audio end token } + } else if (bitmap->type == MTMD_BITMAP_TYPE_VIDEO) { + GGML_ABORT("TODO: implement this"); + return 0; + } else { + LOG_ERR("%s: error: unknown bitmap type %d\n", __func__, (int)bitmap->type); + return 1; } return 0; @@ -858,7 +873,8 @@ mtmd_bitmap * mtmd_bitmap_init(uint32_t nx, uint32_t ny, const unsigned char * data) { mtmd_bitmap * bitmap = new mtmd_bitmap; - bitmap->nx = nx; + bitmap->type = MTMD_BITMAP_TYPE_IMAGE; + bitmap->ny = ny; bitmap->ny = ny; size_t data_size = (size_t)nx * ny * 3; bitmap->data.resize(data_size); @@ -869,15 +885,32 @@ mtmd_bitmap * mtmd_bitmap_init(uint32_t nx, mtmd_bitmap * mtmd_bitmap_init_from_audio(size_t n_samples, const float * data) { mtmd_bitmap * bitmap = new mtmd_bitmap; + bitmap->type = MTMD_BITMAP_TYPE_AUDIO; bitmap->nx = n_samples; bitmap->ny = 1; - bitmap->is_audio = true; size_t data_size = n_samples * sizeof(float); bitmap->data.resize(data_size); std::memcpy(bitmap->data.data(), data, data_size); return bitmap; } +mtmd_bitmap * mtmd_bitmap_init_from_video(uint32_t nx, + uint32_t ny, + uint32_t nframes, + const unsigned char * data) { + mtmd_bitmap * bitmap = new mtmd_bitmap; + bitmap->type = MTMD_BITMAP_TYPE_VIDEO; + bitmap->nx = nx; + bitmap->ny = ny; + bitmap->nz = nframes; + size_t data_size = (size_t)nx * ny * 3 * nframes; + bitmap->data.resize(data_size); + if (data != nullptr) { + std::memcpy(bitmap->data.data(), data, data_size); + } + return bitmap; +} + uint32_t mtmd_bitmap_get_nx(const mtmd_bitmap * bitmap) { return bitmap->nx; } @@ -894,8 +927,16 @@ size_t mtmd_bitmap_get_n_bytes(const mtmd_bitmap * bitmap) { return bitmap->data.size(); } +bool mtmd_bitmap_is_image(const mtmd_bitmap * bitmap) { + return bitmap->type == MTMD_BITMAP_TYPE_IMAGE; +} + bool mtmd_bitmap_is_audio(const mtmd_bitmap * bitmap) { - return bitmap->is_audio; + return bitmap->type == MTMD_BITMAP_TYPE_AUDIO; +} + +bool mtmd_bitmap_is_video(const mtmd_bitmap * bitmap) { + return bitmap->type == MTMD_BITMAP_TYPE_VIDEO; } const char * mtmd_bitmap_get_id(const mtmd_bitmap * bitmap) { @@ -916,6 +957,20 @@ void mtmd_bitmap_free(mtmd_bitmap * bitmap) { } } +bool mtmd_bitmap_set_frame(mtmd_bitmap * bitmap, size_t i, const unsigned char * data) { + if (bitmap->type != MTMD_BITMAP_TYPE_VIDEO) { + LOG_ERR("%s: error: bitmap is not a video\n", __func__); + return false; + } + if (i >= (size_t)bitmap->nz) { + LOG_ERR("%s: error: frame index %zu out of range (nz = %u)\n", __func__, i, bitmap->nz); + return false; + } + size_t frame_size = (size_t)bitmap->nx * bitmap->ny * 3; + std::memcpy(bitmap->data.data() + i * frame_size, data, frame_size); + return true; +} + // mtmd_input_chunks mtmd_input_chunks * mtmd_input_chunks_init() { diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h index 0b5d2ba0c7634..e9add004a9c8f 100644 --- a/tools/mtmd/mtmd.h +++ b/tools/mtmd/mtmd.h @@ -112,6 +112,11 @@ MTMD_API bool mtmd_support_audio(mtmd_context * ctx); // return -1 if audio is not supported MTMD_API int mtmd_get_audio_bitrate(mtmd_context * ctx); +// get the video frame rate in FPS that the model expects +// return -1 if video is not supported +// TODO: implement this +MTMD_API int mtmd_get_video_fps(mtmd_context * ctx); + // mtmd_bitmap // // if bitmap is image: @@ -120,14 +125,25 @@ MTMD_API int mtmd_get_audio_bitrate(mtmd_context * ctx); // if bitmap is audio: // length of data must be n_samples * sizeof(float) // the data is in float format (PCM F32) +// if bitmap is video: +// length of data must be nx * ny * nframes * 3 +// the data is in RGBRGBRGB... format for each frame, frames are stored sequentially +// if data is nullptr, an empty bitmap is created (useful for streaming use case) MTMD_API mtmd_bitmap * mtmd_bitmap_init (uint32_t nx, uint32_t ny, const unsigned char * data); MTMD_API mtmd_bitmap * mtmd_bitmap_init_from_audio(size_t n_samples, const float * data); +MTMD_API mtmd_bitmap * mtmd_bitmap_init_from_video(uint32_t nx, uint32_t ny, uint32_t nframes, const unsigned char * data); MTMD_API uint32_t mtmd_bitmap_get_nx (const mtmd_bitmap * bitmap); MTMD_API uint32_t mtmd_bitmap_get_ny (const mtmd_bitmap * bitmap); MTMD_API const unsigned char * mtmd_bitmap_get_data (const mtmd_bitmap * bitmap); MTMD_API size_t mtmd_bitmap_get_n_bytes(const mtmd_bitmap * bitmap); + +MTMD_API bool mtmd_bitmap_is_image (const mtmd_bitmap * bitmap); MTMD_API bool mtmd_bitmap_is_audio (const mtmd_bitmap * bitmap); +MTMD_API bool mtmd_bitmap_is_video (const mtmd_bitmap * bitmap); MTMD_API void mtmd_bitmap_free (mtmd_bitmap * bitmap); +// set single frame i of the video bitmap +// return false if bitmap is not video or i is out of range +MTMD_API bool mtmd_bitmap_set_frame (mtmd_bitmap * bitmap, size_t i, const unsigned char * data); // bitmap ID is optional, but useful for KV cache tracking // these getters/setters are dedicated functions, so you can for example calculate the hash of the image based on mtmd_bitmap_get_data() MTMD_API const char * mtmd_bitmap_get_id(const mtmd_bitmap * bitmap); From 1d9026f52e223a7c5a7a2b5473731bb8afe9f418 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 2 Nov 2025 19:51:35 +0100 Subject: [PATCH 2/2] add mtmd_helper_video --- tools/mtmd/CMakeLists.txt | 17 +++ tools/mtmd/mtmd-helper.cpp | 218 ++++++++++++++++++++++++++++++++++++- tools/mtmd/mtmd-helper.h | 2 +- tools/mtmd/mtmd.cpp | 7 +- 4 files changed, 237 insertions(+), 7 deletions(-) diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt index 2381012a0d02f..bd964129757b0 100644 --- a/tools/mtmd/CMakeLists.txt +++ b/tools/mtmd/CMakeLists.txt @@ -60,3 +60,20 @@ if(LLAMA_TOOLS_INSTALL) endif() target_link_libraries (${TARGET} PRIVATE common mtmd Threads::Threads) target_compile_features(${TARGET} PRIVATE cxx_std_17) + + +option(MTMD_HELPER_VIDEO "Enable FFmpeg-based video decoding in mtmd-helper" OFF) +if (MTMD_HELPER_VIDEO) + find_package(PkgConfig QUIET) + if (PKG_CONFIG_FOUND) + pkg_check_modules(FFMPEG QUIET IMPORTED_TARGET libavformat libavcodec libswscale libavutil) + if (FFMPEG_FOUND) + target_link_libraries(mtmd PRIVATE PkgConfig::FFMPEG) + target_compile_definitions(mtmd PRIVATE MTMD_HELPER_VIDEO) + else() + message(FATAL_ERROR "FFmpeg not found via pkg-config (required by MTMD_HELPER_VIDEO)") + endif() + else() + message(FATAL_ERROR "pkg-config not found (required by MTMD_HELPER_VIDEO)") + endif() +endif() diff --git a/tools/mtmd/mtmd-helper.cpp b/tools/mtmd/mtmd-helper.cpp index c10741aca64d9..b2ddbd7f9fda2 100644 --- a/tools/mtmd/mtmd-helper.cpp +++ b/tools/mtmd/mtmd-helper.cpp @@ -407,6 +407,7 @@ static bool decode_audio_from_buf(const unsigned char * buf_in, size_t len, int } // namespace audio_helpers mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len) { + // TODO: support loading video files if (audio_helpers::is_audio_file((const char *)buf, len)) { std::vector pcmf32; int bitrate = mtmd_get_audio_bitrate(ctx); @@ -436,7 +437,221 @@ mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigne return result; } + +// internal utils for video loading and decoding +#ifdef MTMD_HELPER_VIDEO +extern "C" { +#include +#include +#include +#include +} +struct mtmd_helper_video_ctx { +public: + mtmd_helper_video_ctx() {} + + // returns true if file is a valid video file + bool open_video_file(const char * fname) { + if (avformat_open_input(&fmt_ctx, fname, nullptr, nullptr) < 0) { + return false; + } + + if (avformat_find_stream_info(fmt_ctx, nullptr) < 0) { + return false; + } + + bool has_video = false; + for (unsigned int i = 0; i < fmt_ctx->nb_streams; i++) { + if (fmt_ctx->streams[i]->codecpar->codec_type == AVMEDIA_TYPE_VIDEO) { + has_video = true; + break; + } + } + + return has_video; + } + + mtmd_bitmap * read_video_file() { + int video_stream_idx = -1; + for (unsigned int i = 0; i < fmt_ctx->nb_streams; i++) { + if (fmt_ctx->streams[i]->codecpar->codec_type == AVMEDIA_TYPE_VIDEO) { + video_stream_idx = i; + break; + } + } + + if (video_stream_idx == -1) { + LOG_ERR("%s: Could not find video stream in the input file\n", __func__); + return nullptr; + } + + codec_par = fmt_ctx->streams[video_stream_idx]->codecpar; + const AVCodec * codec = avcodec_find_decoder(codec_par->codec_id); + if (!codec) { + LOG_ERR("%s: Could not find decoder for codec id %d\n", __func__, codec_par->codec_id); + return nullptr; + } + + codec_ctx = avcodec_alloc_context3(codec); + if (!codec_ctx) { + LOG_ERR("%s: Could not allocate codec context\n", __func__); + return nullptr; + } + + if (avcodec_parameters_to_context(codec_ctx, codec_par) < 0) { + LOG_ERR("%s: Could not copy codec parameters to codec context\n", __func__); + return nullptr; + } + + if (avcodec_open2(codec_ctx, codec, nullptr) < 0) { + LOG_ERR("%s: Could not open codec\n", __func__); + return nullptr; + } + + // prepare reading frames + frame = av_frame_alloc(); + rgb_frame = av_frame_alloc(); + if (!frame || !rgb_frame) { + LOG_ERR("%s: Could not allocate frame\n", __func__); + return nullptr; + } + + int width = codec_ctx->width; + int height = codec_ctx->height; + AVPixelFormat dst_fmt = AV_PIX_FMT_RGB24; + + // allocate output bitmap + int frame_count = get_frame_count(video_stream_idx); + if (frame_count <= 0) { + LOG_ERR("%s: Could not get valid frame count from video stream\n", __func__); + return nullptr; + } + alloc_bitmap(width, height, frame_count); + + // allocate buffer for RGB frame + buffer = (uint8_t *)av_malloc(av_image_get_buffer_size(dst_fmt, width, height, 32)); + av_image_fill_arrays(rgb_frame->data, rgb_frame->linesize, buffer, dst_fmt, width, height, 32); + + sws_ctx = sws_getContext(width, height, codec_ctx->pix_fmt, + width, height, dst_fmt, + SWS_BILINEAR, nullptr, nullptr, nullptr); + if (!sws_ctx) { + LOG_ERR("%s: Could not initialize sws context\n", __func__); + return nullptr; + } + + // read frame per packet + int frame_idx = 0; + pkt = av_packet_alloc(); + while (av_read_frame(fmt_ctx, pkt) >= 0) { + if (pkt->stream_index == video_stream_idx) { + int ret = avcodec_send_packet(codec_ctx, pkt); + if (ret < 0) break; + + while (ret >= 0) { + ret = avcodec_receive_frame(codec_ctx, frame); + if (ret == AVERROR(EAGAIN) || ret == AVERROR_EOF) break; + if (ret < 0) break; + + sws_scale(sws_ctx, frame->data, frame->linesize, 0, height, + rgb_frame->data, rgb_frame->linesize); + + add_frame_to_bitmap(rgb_frame->data[0], frame_idx++); + } + } + av_packet_unref(pkt); + } + + // flush remaining frames + avcodec_send_packet(codec_ctx, nullptr); + int ret; + while (true) { + ret = avcodec_receive_frame(codec_ctx, frame); + if (ret == AVERROR(EAGAIN) || ret == AVERROR_EOF) break; + if (ret < 0) break; + + sws_scale(sws_ctx, frame->data, frame->linesize, 0, height, + rgb_frame->data, rgb_frame->linesize); + + add_frame_to_bitmap(rgb_frame->data[0], frame_idx++); + } + + // edge case: image is misrecognized as video with 1 frame + if (frame_idx == 1) { + LOG_INF("%s: only 1 frame decoded, treating as still image\n", __func__); + return nullptr; + } + + LOG_INF("%s: video loaded, width: %d, height: %d, frames: %d\n", __func__, width, height, frame_idx); + + success = true; + return bitmap; + } + + ~mtmd_helper_video_ctx() { + av_packet_free(&pkt); + sws_freeContext(sws_ctx); + av_freep(&buffer); + av_frame_free(&rgb_frame); + av_frame_free(&frame); + avcodec_free_context(&codec_ctx); + avformat_close_input(&fmt_ctx); + if (!success && bitmap) { + mtmd_bitmap_free(bitmap); + } + } +private: + int get_frame_count(int video_stream_idx) { + if (!fmt_ctx || !codec_ctx) { + return 0; + } + return fmt_ctx->streams[video_stream_idx]->nb_frames; + } + + void add_frame_to_bitmap(const uint8_t * data, int frame_idx) { + mtmd_bitmap_set_frame(bitmap, frame_idx, data); + } + + void alloc_bitmap(int nx, int ny, int nframes) { + GGML_ASSERT(!bitmap); + bitmap = mtmd_bitmap_init_from_video(nx, ny, nframes, nullptr); + if (!bitmap) { + LOG_ERR("%s: failed to initialize video bitmap\n", __func__); + return; + } + } + + AVFormatContext * fmt_ctx = nullptr; + AVCodecParameters * codec_par = nullptr; + AVCodecContext * codec_ctx = nullptr; + SwsContext * sws_ctx = nullptr; + uint8_t * buffer = nullptr; + AVFrame * frame = nullptr; + AVFrame * rgb_frame = nullptr; + AVPacket * pkt = nullptr; + mtmd_bitmap * bitmap = nullptr; + bool success = false; +}; +#endif // MTMD_HELPER_VIDEO + mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname) { +#ifdef MTMD_HELPER_VIDEO + // first, try to read the file as video + // TODO: implement a function to test if the file is a video file before loading it + mtmd_helper_video_ctx video_ctx; + if (video_ctx.open_video_file(fname)) { + mtmd_bitmap * bitmap = video_ctx.read_video_file(); + if (bitmap) { + return bitmap; + } else { + LOG_ERR("%s: failed to load file as video, trying as image/audio\n", __func__); + } + } else { + LOG_INF("%s: file %s is not a video file, trying as image/audio\n", __func__, fname); + } +#endif // MTMD_HELPER_VIDEO + + // otherwise, read the file into memory buffer std::vector buf; FILE * f = fopen(fname, "rb"); if (!f) { @@ -458,6 +673,3 @@ mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * return mtmd_helper_bitmap_init_from_buf(ctx, buf.data(), buf.size()); } - - -// TODO: implement video support here diff --git a/tools/mtmd/mtmd-helper.h b/tools/mtmd/mtmd-helper.h index dce2353d60a7d..de14ec795ae7a 100644 --- a/tools/mtmd/mtmd-helper.h +++ b/tools/mtmd/mtmd-helper.h @@ -31,8 +31,8 @@ MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, con // supported formats: // image: formats supported by stb_image: jpg, png, bmp, gif, etc. // audio: formats supported by miniaudio: wav, mp3, flac -// video: formats supported by ffmpeg: mp4, mov, avi, mkv, etc. // note: audio files will be auto-detected based on magic bytes +// note (2): this function does NOT support video input yet // returns nullptr on failure // this function is thread-safe MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len); diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index b511a0fc56ded..1032db9622bcd 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -36,7 +36,7 @@ struct mtmd_bitmap { uint32_t nz = 1; // number of frames for video, 1 for image/audio std::vector data; std::string id; // optional user-defined id, for ex: can be set to image hash, useful for KV cache tracking - bool type = MTMD_BITMAP_TYPE_IMAGE; + mtmd_bitmap_type type = MTMD_BITMAP_TYPE_IMAGE; }; struct mtmd_image_tokens { @@ -687,7 +687,6 @@ struct mtmd_tokenizer { } } else if (bitmap->type == MTMD_BITMAP_TYPE_VIDEO) { GGML_ABORT("TODO: implement this"); - return 0; } else { LOG_ERR("%s: error: unknown bitmap type %d\n", __func__, (int)bitmap->type); return 1; @@ -874,8 +873,9 @@ mtmd_bitmap * mtmd_bitmap_init(uint32_t nx, const unsigned char * data) { mtmd_bitmap * bitmap = new mtmd_bitmap; bitmap->type = MTMD_BITMAP_TYPE_IMAGE; + bitmap->nx = nx; bitmap->ny = ny; - bitmap->ny = ny; + bitmap->nz = 1; size_t data_size = (size_t)nx * ny * 3; bitmap->data.resize(data_size); std::memcpy(bitmap->data.data(), data, data_size); @@ -888,6 +888,7 @@ mtmd_bitmap * mtmd_bitmap_init_from_audio(size_t n_samples, bitmap->type = MTMD_BITMAP_TYPE_AUDIO; bitmap->nx = n_samples; bitmap->ny = 1; + bitmap->nz = 1; size_t data_size = n_samples * sizeof(float); bitmap->data.resize(data_size); std::memcpy(bitmap->data.data(), data, data_size);