diff --git a/.clang-tidy b/.clang-tidy index 803b8b46a32f3..41f5f5ce626a5 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -14,6 +14,8 @@ Checks: > -readability-uppercase-literal-suffix, -readability-simplify-boolean-expr, -readability-math-missing-parentheses, + -readability-braces-around-statements, + -readability-isolate-declaration, clang-analyzer-*, -clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling, performance-*, diff --git a/common/arg.cpp b/common/arg.cpp index 4316917d74595..4bb04cc9902a1 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2768,6 +2768,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.image.emplace_back(value); } ).set_examples({LLAMA_EXAMPLE_MTMD})); + add_opt(common_arg( + {"--video"}, "PATH", + "path to a video file (requires FFmpeg at build time) or a directory of frames; can be repeated.\n", + [](common_params & params, const std::string & value) { + params.video.emplace_back(value); + } + ).set_examples({LLAMA_EXAMPLE_MTMD})); add_opt(common_arg( {"--image-min-tokens"}, "N", "minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)", diff --git a/common/common.h b/common/common.h index 78c568a7bc62e..37bc2e80e5b8f 100644 --- a/common/common.h +++ b/common/common.h @@ -406,6 +406,7 @@ struct common_params { bool mmproj_use_gpu = true; // use GPU for multimodal model bool no_mmproj = false; // explicitly disable multimodal model std::vector image; // path to image file(s) + std::vector video; // path to video file(s) or frame directories int image_min_tokens = -1; int image_max_tokens = -1; diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt index 2381012a0d02f..e135d4727f0a4 100644 --- a/tools/mtmd/CMakeLists.txt +++ b/tools/mtmd/CMakeLists.txt @@ -5,12 +5,14 @@ find_package(Threads REQUIRED) add_library(mtmd mtmd.cpp mtmd-audio.cpp + mtmd-video.cpp mtmd.h clip.cpp clip.h clip-impl.h 
mtmd-helper.cpp mtmd-helper.h + mtmd-video.h ) target_link_libraries (mtmd PUBLIC ggml llama) @@ -20,6 +22,23 @@ target_include_directories(mtmd PRIVATE ../..) target_include_directories(mtmd PRIVATE ../../vendor) target_compile_features (mtmd PRIVATE cxx_std_17) +# Optional FFmpeg support for video decoding +option(MTMD_WITH_FFMPEG "Enable FFmpeg-based video decoding in mtmd-video" OFF) +if (MTMD_WITH_FFMPEG) + find_package(PkgConfig QUIET) + if (PKG_CONFIG_FOUND) + pkg_check_modules(FFMPEG QUIET IMPORTED_TARGET libavformat libavcodec libswscale libavutil) + if (FFMPEG_FOUND) + target_link_libraries(mtmd PRIVATE PkgConfig::FFMPEG) + target_compile_definitions(mtmd PRIVATE MTMD_WITH_FFMPEG) + else() + message(WARNING "FFmpeg not found via pkg-config; MTMD_WITH_FFMPEG disabled") + endif() + else() + message(WARNING "pkg-config not found; MTMD_WITH_FFMPEG disabled") + endif() +endif() + if (BUILD_SHARED_LIBS) set_target_properties (mtmd PROPERTIES POSITION_INDEPENDENT_CODE ON) target_compile_definitions(mtmd PRIVATE LLAMA_BUILD) @@ -29,6 +48,7 @@ endif() set(MTMD_PUBLIC_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/mtmd.h ${CMAKE_CURRENT_SOURCE_DIR}/mtmd-helper.h + ${CMAKE_CURRENT_SOURCE_DIR}/mtmd-video.h ) set_target_properties(mtmd diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 99775cb3e351c..ac7247dd08bdb 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -201,6 +201,7 @@ struct clip_hparams { // legacy bool has_llava_projector = false; int minicpmv_version = 0; + int minicpmv_max_slice_nums = 9; int32_t minicpmv_query_num = 0; // MiniCPM-V query number // custom value provided by user, can be undefined if not set @@ -3911,16 +3912,67 @@ struct llava_uhd { const bool has_slices = original_size.width > slice_size || original_size.height > slice_size; const bool has_pinpoints = !ctx->model.hparams.image_res_candidates.empty(); - if (!has_slices) { - // skip slicing logic - res.overview_size = clip_image_size{slice_size, slice_size}; - res.refined_size 
= clip_image_size{0, 0}; - res.grid_size = clip_image_size{0, 0}; + if (clip_is_minicpmv(ctx)) { + auto best_size = get_best_resize(original_size, slice_size, patch_size, !has_slices); + res.overview_size = best_size; + + { + const int max_slice_nums = ctx->model.hparams.minicpmv_max_slice_nums; + const float log_ratio = log((float)original_width / original_height); + const float ratio = (float)original_width * original_height / (slice_size * slice_size); + const int multiple = fmin(ceil(ratio), max_slice_nums); + + auto best_grid = get_best_grid(max_slice_nums, multiple, log_ratio); + auto refine_size = get_refine_size(original_size, best_grid, slice_size, patch_size, true); + res.grid_size = best_grid; + res.refined_size = refine_size; + + LOG_DBG("%s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d, grid size: %d x %d\n", + __func__, original_width, original_height, + res.overview_size.width, res.overview_size.height, + res.refined_size.width, res.refined_size.height, + res.grid_size.width, res.grid_size.height); + + if (!has_slices || max_slice_nums == 0) { + return res; + } + + int width = refine_size.width; + int height = refine_size.height; + int grid_x = int(width / best_grid.width); + int grid_y = int(height / best_grid.height); + for (int patches_y = 0, ic = 0; + patches_y < refine_size.height && ic < best_grid.height; + patches_y += grid_y, ic += 1) { + for (int patches_x = 0, jc = 0; + patches_x < refine_size.width && jc < best_grid.width; + patches_x += grid_x, jc += 1) { + slice_coordinates slice; + slice.x = patches_x; + slice.y = patches_y; + slice.size.width = grid_x; + slice.size.height = grid_y; + res.slices.push_back(slice); + LOG_DBG("%s: slice %d: x=%d, y=%d, size=%dx%d\n", + __func__, (int)res.slices.size() - 1, + slice.x, slice.y, slice.size.width, slice.size.height); + } + } + } return res; } + else { + if (!has_slices) { + // skip slicing logic + res.overview_size = clip_image_size{slice_size, slice_size}; + 
res.refined_size = clip_image_size{0, 0}; + res.grid_size = clip_image_size{0, 0}; + + return res; + } - if (has_pinpoints) { + if (has_pinpoints) { // has pinpoints, use them to calculate the grid size (e.g. llava-1.6) auto refine_size = llava_uhd::select_best_resolution( original_size, @@ -3956,53 +4008,7 @@ struct llava_uhd { return res; } - - // no pinpoints, dynamically calculate the grid size (e.g. minicpmv) - - auto best_size = get_best_resize(original_size, slice_size, patch_size, !has_slices); - res.overview_size = best_size; - - { - const int max_slice_nums = 9; // TODO: this is only used by minicpmv, maybe remove it - const float log_ratio = log((float)original_width / original_height); - const float ratio = (float)original_width * original_height / (slice_size * slice_size); - const int multiple = fmin(ceil(ratio), max_slice_nums); - - auto best_grid = get_best_grid(max_slice_nums, multiple, log_ratio); - auto refine_size = get_refine_size(original_size, best_grid, slice_size, patch_size, true); - res.grid_size = best_grid; - res.refined_size = refine_size; - - LOG_DBG("%s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d, grid size: %d x %d\n", - __func__, original_width, original_height, - res.overview_size.width, res.overview_size.height, - res.refined_size.width, res.refined_size.height, - res.grid_size.width, res.grid_size.height); - - int width = refine_size.width; - int height = refine_size.height; - int grid_x = int(width / best_grid.width); - int grid_y = int(height / best_grid.height); - for (int patches_y = 0, ic = 0; - patches_y < refine_size.height && ic < best_grid.height; - patches_y += grid_y, ic += 1) { - for (int patches_x = 0, jc = 0; - patches_x < refine_size.width && jc < best_grid.width; - patches_x += grid_x, jc += 1) { - slice_coordinates slice; - slice.x = patches_x; - slice.y = patches_y; - slice.size.width = grid_x; - slice.size.height = grid_y; - res.slices.push_back(slice); - LOG_DBG("%s: slice %d: x=%d, 
y=%d, size=%dx%d\n", - __func__, (int)res.slices.size() - 1, - slice.x, slice.y, slice.size.width, slice.size.height); - } - } } - - return res; } static std::vector slice_image(const clip_image_u8 * img, const slice_instructions & inst) { diff --git a/tools/mtmd/mtmd-cli.cpp b/tools/mtmd/mtmd-cli.cpp index 3e19e95958a2f..4b31061ef0ab8 100644 --- a/tools/mtmd/mtmd-cli.cpp +++ b/tools/mtmd/mtmd-cli.cpp @@ -6,12 +6,15 @@ #include "ggml.h" #include "console.h" #include "chat.h" +#include "clip.h" #include "mtmd.h" #include "mtmd-helper.h" +#include "mtmd-video.h" #include #include #include +#include #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) #include @@ -157,8 +160,8 @@ struct mtmd_cli_context { ); } - bool load_media(const std::string & fname) { - mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(ctx_vision.get(), fname.c_str())); + bool load_media(const std::string & path) { + mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(ctx_vision.get(), path.c_str())); if (!bmp.ptr) { return false; } @@ -287,7 +290,7 @@ int main(int argc, char ** argv) { mtmd_cli_context ctx(params); LOG("%s: loading model: %s\n", __func__, params.model.path.c_str()); - bool is_single_turn = !params.prompt.empty() && !params.image.empty(); + bool is_single_turn = !params.prompt.empty() && (!params.image.empty() || !params.video.empty()); int n_predict = params.n_predict < 0 ? 
INT_MAX : params.n_predict; @@ -311,19 +314,34 @@ int main(int argc, char ** argv) { if (is_single_turn) { g_is_generating = true; - if (params.prompt.find(mtmd_default_marker()) == std::string::npos) { - for (size_t i = 0; i < params.image.size(); i++) { - params.prompt += mtmd_default_marker(); - } - } - common_chat_msg msg; - msg.role = "user"; - msg.content = params.prompt; + + // 1) load all media first + size_t n_loaded_media = 0; for (const auto & image : params.image) { if (!ctx.load_media(image)) { return 1; // error is already printed by libmtmd } + n_loaded_media += 1; + } + for (const auto & vpath : params.video) { + if (!ctx.load_media(vpath)) { + return 1; // error is already printed by libmtmd + } + n_loaded_media += 1; } + + // 2) build prompt content with correct number of markers + std::string prompt_content = params.prompt; + if (prompt_content.find(mtmd_default_marker()) == std::string::npos) { + for (size_t i = 0; i < n_loaded_media; i++) { + prompt_content += mtmd_default_marker(); + } + } + + // 3) run + common_chat_msg msg; + msg.role = "user"; + msg.content = prompt_content; if (eval_message(ctx, msg)) { return 1; } @@ -339,6 +357,9 @@ int main(int argc, char ** argv) { if (mtmd_support_audio(ctx.ctx_vision.get())) { LOG("\n /audio load an audio"); } + if (mtmd_support_vision(ctx.ctx_vision.get())) { + LOG("\n /video load a video"); + } LOG("\n /clear clear the chat history"); LOG("\n /quit or /exit exit the program"); LOG("\n"); @@ -370,14 +391,15 @@ int main(int argc, char ** argv) { g_is_generating = true; bool is_image = line == "/image" || line.find("/image ") == 0; bool is_audio = line == "/audio" || line.find("/audio ") == 0; - if (is_image || is_audio) { + bool is_video = line == "/video" || line.find("/video ") == 0; + if (is_image || is_audio || is_video) { if (line.size() < 8) { LOG_ERR("ERR: Missing media filename\n"); continue; } std::string media_path = line.substr(7); if (ctx.load_media(media_path)) { - LOG("%s %s loaded\n", 
media_path.c_str(), is_image ? "image" : "audio"); + LOG("%s %s loaded\n", media_path.c_str(), is_image ? "image" : (is_audio ? "audio" : "video")); content += mtmd_default_marker(); } // else, error is already printed by libmtmd diff --git a/tools/mtmd/mtmd-helper.cpp b/tools/mtmd/mtmd-helper.cpp index 686f42f3960fe..bdb9cc82faf2f 100644 --- a/tools/mtmd/mtmd-helper.cpp +++ b/tools/mtmd/mtmd-helper.cpp @@ -5,12 +5,18 @@ # define NOMINMAX #endif #include +#else +#include +#include +#include #endif #include "mtmd.h" #include "mtmd-helper.h" #include "llama.h" +#include "mtmd-video.h" + #include #include #include @@ -421,6 +427,10 @@ mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigne return mtmd_bitmap_init_from_audio(pcmf32.size(), pcmf32.data()); } + if(mtmd_video::is_video_buffer(buf, len)) { + return mtmd_video::init_video_bitmap(ctx, buf, len); + } + // otherwise, we assume it's an image mtmd_bitmap * result = nullptr; { @@ -436,25 +446,34 @@ mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigne return result; } -mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname) { - std::vector buf; - FILE * f = fopen(fname, "rb"); +mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * path) { + // Attention! 
A directory containing frames images is also considered a video + // so path which is a directory should be handled by mtmd_video::init_video_bitmap + // Besides, although we could read the file into memory and call mtmd_helper_bitmap_init_from_buf, + // but for video files, it's better to let ffmpeg read from file + if(mtmd_video::is_video_file(path)){ + return mtmd_video::init_video_bitmap(ctx, path); + } + + FILE * f = fopen(path, "rb"); if (!f) { - LOG_ERR("Unable to open file %s: %s\n", fname, strerror(errno)); + LOG_ERR("Unable to open path %s: %s\n", path, strerror(errno)); return nullptr; } fseek(f, 0, SEEK_END); long file_size = ftell(f); fseek(f, 0, SEEK_SET); - buf.resize(file_size); + auto * buf = new unsigned char[file_size]; - size_t n_read = fread(buf.data(), 1, file_size, f); + size_t n_read = fread(buf, 1, file_size, f); fclose(f); if (n_read != (size_t)file_size) { - LOG_ERR("Failed to read entire file %s", fname); + LOG_ERR("Failed to read entire path %s", path); return nullptr; } - return mtmd_helper_bitmap_init_from_buf(ctx, buf.data(), buf.size()); + auto * res = mtmd_helper_bitmap_init_from_buf(ctx, buf, file_size); + delete [] buf; + return res; } diff --git a/tools/mtmd/mtmd-helper.h b/tools/mtmd/mtmd-helper.h index 5c0edc6937eee..e9085e036da82 100644 --- a/tools/mtmd/mtmd-helper.h +++ b/tools/mtmd/mtmd-helper.h @@ -21,10 +21,11 @@ extern "C" { // // helper function to construct a mtmd_bitmap from a file -// it calls mtmd_helper_bitmap_init_from_buf() internally +// for image and audio, it calls mtmd_helper_bitmap_init_from_buf() internally +// for video, it it calls init_video_bitmap() to reads and decodes and streams individual image frames to a bitmap // returns nullptr on failure // this function is thread-safe -MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname); +MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * path); // helper function to construct 
// Return true when `path` names a readable file that both
//   1) ends with a known image extension (case-insensitive), and
//   2) starts with the magic bytes of JPEG, PNG, BMP, GIF or WEBP.
// Files shorter than 4 bytes, unreadable files and unknown signatures yield false.
static bool is_image_file(const std::string & path) {
    // check extension first (lower-cased copy => case-insensitive comparison)
    std::string lower = path;
    std::transform(lower.begin(), lower.end(), lower.begin(),
                   [](unsigned char c){ return (char)std::tolower(c); });

    // The extension must be the *suffix* of the path; a plain substring search
    // would also accept names such as "frame.png.txt" or "shots.jpg/readme".
    const auto ends_with = [&lower](const std::string & ext) {
        return lower.size() >= ext.size() &&
               lower.compare(lower.size() - ext.size(), ext.size(), ext) == 0;
    };
    const bool has_ext = ends_with(".jpg")  || ends_with(".jpeg") ||
                         ends_with(".png")  || ends_with(".bmp")  ||
                         ends_with(".gif")  || ends_with(".webp");
    if (!has_ext) return false;

    // then check the magic bytes
    std::ifstream f(path, std::ios::binary);
    if (!f.is_open()) return false;

    unsigned char header[16] = {0};
    f.read(reinterpret_cast<char *>(header), sizeof(header));
    const size_t n = (size_t) f.gcount();
    if (n < 4) return false; // too small to be an image

    // JPEG: FF D8 FF
    if (header[0] == 0xFF && header[1] == 0xD8 && header[2] == 0xFF) {
        return true;
    }

    // PNG: 89 50 4E 47 0D 0A 1A 0A
    if (n >= 8 &&
        header[0] == 0x89 && header[1] == 0x50 &&
        header[2] == 0x4E && header[3] == 0x47 &&
        header[4] == 0x0D && header[5] == 0x0A &&
        header[6] == 0x1A && header[7] == 0x0A) {
        return true;
    }

    // BMP: 42 4D
    if (header[0] == 0x42 && header[1] == 0x4D) {
        return true;
    }

    // GIF: "GIF87a" or "GIF89a"
    if (n >= 6 &&
        header[0] == 'G' && header[1] == 'I' && header[2] == 'F' &&
        header[3] == '8' &&
        (header[4] == '7' || header[4] == '9') &&
        header[5] == 'a') {
        return true;
    }

    // WEBP: "RIFF....WEBP"
    if (n >= 12 &&
        header[0] == 'R' && header[1] == 'I' && header[2] == 'F' && header[3] == 'F' &&
        header[8] == 'W' && header[9] == 'E' && header[10] == 'B' && header[11] == 'P') {
        return true;
    }

    return false;
}
|| name == "..") continue; + std::string path = dir + "/" + name; + if (is_dir(path)) { + if (recursive) list_files(path, out, recursive); + } else { + out.push_back(path); + } + } + closedir(dp); +#endif +} + +namespace mtmd_video { + +bool is_video_file(const std::string & path){ + if(is_dir(path)) return true; // a directory containing frames images is also considered a video + auto lower = path; + std::transform(lower.begin(), lower.end(), lower.begin(), [](unsigned char c){ return (char)std::tolower(c); }); + return lower.rfind(".mp4") != std::string::npos || + lower.rfind(".mov") != std::string::npos || + lower.rfind(".mkv") != std::string::npos || + lower.rfind(".avi") != std::string::npos || + lower.rfind(".webm") != std::string::npos; +} + +static void get_video_info_from_dir(const std::string &path, VideoInfo &info){ + info.fps = 1; // do not care + std::vector files; + list_files(path, files, true); // recursive + info.total_frames = files.size(); +} +// untested +static mtmd_bitmap* load_frames_from_dir(mtmd_context * ctx, + const std::string & dir_path, + const LoadVideoOptions & opts) { + if (!ctx || dir_path.empty() || !is_dir(dir_path) || opts.max_frames < 1) { + return nullptr; + } + std::vector files; + list_files(dir_path, files, opts.recursive); + std::sort(files.begin(), files.end()); + + auto stride = std::max(1u, opts.stride); + size_t loaded = 0; + mtmd_bitmap* out_frames = nullptr; + const auto nframes = std::min(files.size() / stride, (size_t)opts.max_frames); + + uint32_t w=0, h=0; + for (size_t i = 0; i < files.size(); i++) { + if (i % stride != 0) continue; + const std::string & f = files[i]; + if (!is_image_file(f)) continue; + mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(ctx, f.c_str())); + if (!bmp.ptr) continue; + if(loaded==0){ + w = bmp.nx(); + h = bmp.ny(); + out_frames = mtmd_bitmap_init_from_video(w, h, nframes, nullptr); + } + GGML_ASSERT(bmp.nx() == w && bmp.ny() == h); // all frames must have the same size + 
// Queue of decoded RGB frames.
// It is designed for streaming scenarios, where an FFmpeg-related thread acts as
// the producer generating frames and the main program (the LLM inference thread)
// acts as the consumer processing them.
// NOTE(review): the struct itself provides no synchronization.
struct DecodedFramesQueue {
    int width  = 0; // frame width in pixels, 0 until the first decode sets it
    int height = 0; // frame height in pixels, 0 until the first decode sets it
    std::queue<std::vector<uint8_t>> frames; // frame size = width * height * 3
};
(int)std::ceil(info.fps) : 1; + } + + return opts; +} + +// --- FFmpeg-based file decoding (optional) --- + +#ifdef MTMD_WITH_FFMPEG +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +extern "C" { +#include +#include +#include +#include +} +#pragma GCC diagnostic pop + +bool is_video_buffer(const uint8_t *data, size_t size){ + if (!data || size < 16) return false; // too short + + AVProbeData probe; + probe.filename = ""; + + // ffmpeg requires that the last AVPROBE_PADDING_SIZE bytes of the buffer must be 0 + auto * padded = new uint8_t[size + AVPROBE_PADDING_SIZE]; + memcpy(padded, data, size); + memset(padded + size, 0, AVPROBE_PADDING_SIZE); + probe.buf = padded; + probe.buf_size = (int)size; + + int score = 0; + const AVInputFormat *fmt = av_probe_input_format2(&probe, 1, &score); + + delete [] padded; + + if (!fmt || score < 25) return false; + if (fmt->flags & AVFMT_NOFILE) return false; + + static const char *image_formats[] = { + "jpeg_pipe", "png_pipe", "bmp_pipe", "gif_pipe", "webp_pipe", + "tiff_pipe", "image2", "image2pipe", "mjpeg" + }; + for (const auto* name : image_formats) + if (fmt->name && strstr(fmt->name, name)) + return false; + + return true; +} + +struct BufferData { + const uint8_t* base; + size_t size; + size_t pos; + BufferData(const uint8_t* b, size_t s) : base(b), size(s), pos(0) {} +}; + +static int read_packet(void* opaque, uint8_t* buf, int buf_size) { + BufferData* bd = static_cast(opaque); + if (!bd || !bd->base) return AVERROR(EIO); + if (bd->pos >= bd->size) return AVERROR_EOF; + size_t rem = bd->size - bd->pos; + int to_read = (int)(rem < (size_t)buf_size ? 
rem : (size_t)buf_size); + if (to_read == 0) return AVERROR_EOF; + memcpy(buf, bd->base + bd->pos, to_read); + bd->pos += to_read; + return to_read; +} + +static int64_t seek_packet(void* opaque, int64_t offset, int whence) { + BufferData* bd = static_cast(opaque); + if (!bd) return -1; + if (whence == AVSEEK_SIZE) return (int64_t)bd->size; + size_t newpos = bd->pos; + if (whence == SEEK_SET) { + if (offset < 0 || (size_t)offset > bd->size) return -1; + newpos = (size_t)offset; + } else if (whence == SEEK_CUR) { + if (offset < 0 && (size_t)(-offset) > bd->pos) return -1; + newpos = bd->pos + (size_t)offset; + if (newpos > bd->size) return -1; + } else if (whence == SEEK_END) { + if (offset > 0 || (size_t)(-offset) > bd->size) return -1; + newpos = bd->size + (size_t)offset; + } else return -1; + bd->pos = newpos; + return (int64_t)bd->pos; +} + +static bool create_format_context_from_buffer(const uint8_t* buffer, size_t size, + AVFormatContext*& fmt, + AVIOContext*& avio_ctx) { + fmt = nullptr; + avio_ctx = nullptr; + uint8_t* avio_ctx_buffer = nullptr; + + if (!buffer || size == 0) return false; + + // allocate BufferData + BufferData* bd = new (std::nothrow) BufferData(buffer, size); + if (!bd) return false; + + const int AVIO_BUF_SIZE = 4096; + avio_ctx_buffer = static_cast(av_malloc(AVIO_BUF_SIZE)); + if (!avio_ctx_buffer) { + delete bd; + return false; + } + + avio_ctx = avio_alloc_context( + avio_ctx_buffer, AVIO_BUF_SIZE, + 0, // read only + bd, + &read_packet, + nullptr, + &seek_packet + ); + + if (!avio_ctx) { + av_free(avio_ctx_buffer); + delete bd; + avio_ctx_buffer = nullptr; + return false; + } + + fmt = avformat_alloc_context(); + if (!fmt) { + // avio_context_free frees ctx->buffer but NOT opaque + if (avio_ctx->opaque) delete static_cast(avio_ctx->opaque); + avio_context_free(&avio_ctx); + avio_ctx_buffer = nullptr; + return false; + } + + fmt->pb = avio_ctx; + fmt->flags |= AVFMT_FLAG_CUSTOM_IO; + + // increase probing - optional but helpful for 
truncated/streamed files + AVDictionary* opts = nullptr; + av_dict_set(&opts, "probesize", "5000000", 0); + av_dict_set(&opts, "analyzeduration", "5000000", 0); + + int ret = avformat_open_input(&fmt, "stream", nullptr, &opts); + av_dict_free(&opts); + + if (ret < 0) { + // Clean up carefully + // If fmt exists and has pb, free pb and opaque appropriately + if (fmt) { + AVIOContext* pb = fmt->pb; + BufferData* bd_from_fmt = pb ? static_cast(pb->opaque) : nullptr; + avformat_free_context(fmt); + if (pb) { + delete bd_from_fmt; + avio_context_free(&pb); // frees pb->buffer + } + fmt = nullptr; + } else { + // fmt null: free avio_ctx and opaque + if (avio_ctx) { + if (avio_ctx->opaque) delete static_cast(avio_ctx->opaque); + avio_context_free(&avio_ctx); + avio_ctx = nullptr; + } + } + avio_ctx_buffer = nullptr; + return false; + } + + // success: avformat_open_input succeeded, fmt and pb are owned by caller, + // but opaque (BufferData) must be deleted by us later (avformat_close_input won't delete opaque). 
+ return true; +} + +static void free_format_context_from_buffer(AVFormatContext* fmt, + AVIOContext* avio_ctx) { + if (fmt) { + // capture pb->opaque BEFORE closing + AVIOContext* pb = fmt->pb; + BufferData* bd = nullptr; + if (pb) bd = static_cast(pb->opaque); + + // this closes fmt and frees pb (and pb->buffer) + avformat_close_input(&fmt); + + // avformat_close_input does not free opaque, so free it now + if (bd) { + delete bd; + bd = nullptr; + } + // do NOT av_free(avio_ctx_buffer) here - it was freed with pb->buffer + return; + } + + // partial failure case: fmt is null but avio_ctx may still be valid + if (avio_ctx) { + BufferData* bd = static_cast(avio_ctx->opaque); + delete bd; + avio_context_free(&avio_ctx); // frees avio_ctx->buffer + // avio_ctx_buffer already freed by avio_context_free + return; + } +} + + +static bool get_video_info_from_format_ctx(AVFormatContext *fmt, VideoInfo &info) { + if (!fmt) return false; + + if (avformat_find_stream_info(fmt, nullptr) < 0) { + return false; + } + + // find video stream + int vstream = av_find_best_stream(fmt, AVMEDIA_TYPE_VIDEO, -1, -1, nullptr, 0); + if (vstream < 0) { + return false; + } + + AVStream *st = fmt->streams[vstream]; + + // get fps + if (st->avg_frame_rate.num > 0 && st->avg_frame_rate.den > 0){ + info.fps = av_q2d(st->avg_frame_rate); + }else if (st->r_frame_rate.num > 0 && st->r_frame_rate.den > 0){ + info.fps = av_q2d(st->r_frame_rate); + } + // get total frames + if (st->nb_frames > 0){ + info.total_frames = st->nb_frames; + }else if (fmt->duration > 0 && info.fps > 0.0){ + // estimate total frames if nb_frames is not available + info.total_frames = std::llround((fmt->duration / (double)AV_TIME_BASE) * info.fps); + } + + return true; +} + +static bool decode_video_ffmpeg_to_rgba_from_format_ctx( + AVFormatContext* fmt, + DecodedFramesQueue& out_q, + mtmd_video::LoadVideoOptions opts) +{ + const auto stride = opts.stride; + const auto max_frames = opts.max_frames; + if(!fmt || stride <= 0 
|| max_frames <= 0) return false; + // vstream checked before in get_video_info_from_format_ctx + int vstream = av_find_best_stream(fmt, AVMEDIA_TYPE_VIDEO, -1, -1, nullptr, 0); + if (vstream < 0) return false; + AVStream * st = fmt->streams[vstream]; + const AVCodec * dec = avcodec_find_decoder(st->codecpar->codec_id); + if (!dec){ + LOG_ERR("%s: Could not find decoder for codec id %d\n", __func__, st->codecpar->codec_id); + return false; + } + AVCodecContext * ctx = avcodec_alloc_context3(dec); + if (!ctx){ + LOG_ERR("%s: Could not allocate codec context\n", __func__); + return false; + } + std::unique_ptr ctx_guard(ctx, [](AVCodecContext *c){ if (c) avcodec_free_context(&c); }); + if (avcodec_parameters_to_context(ctx, st->codecpar) < 0){ + LOG_ERR("%s: Could not copy codec parameters to codec context\n", __func__); + return false; + } + if (avcodec_open2(ctx, dec, nullptr) < 0){ + LOG_ERR("%s: Could not open codec2\n", __func__); + return false; + } + + AVFrame * frame = av_frame_alloc(); + AVPacket * pkt = av_packet_alloc(); + std::unique_ptr frame_guard(frame, [](AVFrame *f){ if (f) av_frame_free(&f); }); + std::unique_ptr pkt_guard(pkt, [](AVPacket *p){ if (p) av_packet_free(&p); }); + + SwsContext * sws = nullptr; + uint32_t idx = 0; + uint32_t taken = 0; + const int w = ctx->width; + const int h = ctx->height; + + if(!out_q.frames.empty() && (out_q.height!=h || out_q.width!=w)){ + LOG_ERR("Different size frames remain in decoded frames queue"); + return false; + } + out_q.width = w; + out_q.height = h; + + while (av_read_frame(fmt, pkt) >= 0) { + if (pkt->stream_index != vstream) { av_packet_unref(pkt); continue; } + if (avcodec_send_packet(ctx, pkt) < 0) { av_packet_unref(pkt); break; } + av_packet_unref(pkt); + while (avcodec_receive_frame(ctx, frame) == 0) { + if (idx++ % stride != stride/2) { av_frame_unref(frame); continue; } + if (!sws) { + sws = sws_getContext(frame->width, frame->height, (AVPixelFormat)frame->format, + frame->width, frame->height, 
// Frame sampling options used when loading a video or a directory of frames.
struct LoadVideoOptions {
    // Upper bound on the number of frames to load. Must be at least 1: the
    // loaders in mtmd-video.cpp reject 0 rather than treating it as "no limit".
    uint32_t max_frames{32};
    // Sampling step: one frame is taken out of every `stride` frames.
    uint32_t stride{1};
    // Directory input only: also scan subdirectories for frame files.
    bool recursive{false};
};
+// Returns a mtmd_bitmap containing all frames in RGB format. +mtmd_bitmap* init_video_bitmap(mtmd_context * ctx, + const std::string & path); +mtmd_bitmap* init_video_bitmap(mtmd_context * ctx, + const uint8_t* buffer, + size_t size); + +} // namespace mtmd_video + +#endif // MTMD_VIDEO_H + + diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index 325f7ff995e36..e43a0007a5eda 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -16,19 +16,34 @@ #include #include +#include #include #include #include #include -// represents raw image data, layout is RGBRGBRGB... -// length of data must be nx * ny * 3 +enum mtmd_bitmap_type { + MTMD_BITMAP_TYPE_IMAGE, + MTMD_BITMAP_TYPE_AUDIO, + MTMD_BITMAP_TYPE_VIDEO, +}; + +// if bitmap is image: +// length of data must be nx * ny * 3 +// the data is in RGBRGBRGB... format +// if bitmap is audio: +// length of data must be n_samples * sizeof(float) +// the data is in float format (PCM F32) +// if bitmap is video: +// length of data must be nx * ny * nframes * 3 +// the data is in RGBRGBRGB... 
format for each frame, frames are stored sequentially struct mtmd_bitmap { + mtmd_bitmap_type type; uint32_t nx; - uint32_t ny; + uint32_t ny; // for audio, ny=1 + uint32_t nz; // for video: number of frames, always 1 for image and audio std::vector data; std::string id; // optional user-defined id, for ex: can be set to image hash, useful for KV cache tracking - bool is_audio = false; // true if the bitmap is audio }; struct mtmd_image_tokens { @@ -87,6 +102,12 @@ enum mtmd_slice_tmpl { MTMD_SLICE_TMPL_IDEFICS3, }; +// video schema, used to tokenize video frames +enum mtmd_video_schema { + MTMD_VIDEO_SCHEMA_MINICPMV_2_6, // add overview image only (no slicing) + MTMD_VIDEO_SCHEMA_MINICPMV_4_5, +}; + const char * mtmd_default_marker() { return "<__media__>"; } @@ -134,6 +155,7 @@ struct mtmd_context { // for llava-uhd style models, we need special tokens in-between slices // minicpmv calls them "slices", llama 4 calls them "tiles" mtmd_slice_tmpl slice_tmpl = MTMD_SLICE_TMPL_NONE; + mtmd_video_schema video_sch = MTMD_VIDEO_SCHEMA_MINICPMV_2_6; std::vector tok_ov_img_start; // overview image std::vector tok_ov_img_end; // overview image std::vector tok_slices_start; // start of all slices @@ -505,58 +527,61 @@ struct mtmd_tokenizer { } } - int32_t add_media(const mtmd_bitmap * bitmap) { - if (!bitmap->is_audio) { - // handle image - - if (!ctx->ctx_v) { - LOG_ERR("%s: error: model does not support vision input\n", __func__); - return 2; - } + int32_t add_image(const mtmd_bitmap * bitmap, bool have_slices=true) { + if(mtmd_bitmap_is_audio(bitmap) || mtmd_bitmap_is_video(bitmap)){ + LOG_ERR("%s: error: bitmap is not an image\n", __func__); + return 2; + } + // handle image + if (!ctx->ctx_v) { + LOG_ERR("%s: error: model does not support vision input\n", __func__); + return 2; + } - if (!ctx->img_beg.empty()) { - add_text(ctx->img_beg, true); // add image begin token - } + if (!ctx->img_beg.empty()) { + add_text(ctx->img_beg, true); // add image begin token + } - // 
convert mtmd_bitmap to clip_image_u8 - clip_image_u8_ptr img_u8(clip_image_u8_init()); - img_u8->nx = bitmap->nx; - img_u8->ny = bitmap->ny; - img_u8->buf.resize(bitmap->data.size()); - std::memcpy(img_u8->buf.data(), bitmap->data.data(), img_u8->nx * img_u8->ny * 3); + // convert mtmd_bitmap to clip_image_u8 + clip_image_u8_ptr img_u8(clip_image_u8_init()); + img_u8->nx = bitmap->nx; + img_u8->ny = bitmap->ny; + img_u8->buf.resize(bitmap->data.size()); + std::memcpy(img_u8->buf.data(), bitmap->data.data(), img_u8->nx * img_u8->ny * 3); + + // preprocess image + clip_image_f32_batch batch_f32; + bool ok = clip_image_preprocess(ctx->ctx_v, img_u8.get(), &batch_f32); + if (!ok) { + LOG_ERR("Unable to preprocess image\n"); + return 2; + } - // preprocess image - clip_image_f32_batch batch_f32; - bool ok = clip_image_preprocess(ctx->ctx_v, img_u8.get(), &batch_f32); - if (!ok) { - LOG_ERR("Unable to preprocess image\n"); - return 2; + // handle llava-uhd style preprocessing + if ( + ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5 + || ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6 + || ctx->slice_tmpl == MTMD_SLICE_TMPL_LLAMA4 + || ctx->slice_tmpl == MTMD_SLICE_TMPL_IDEFICS3 + ) { + const int n_col = batch_f32.grid_x; + const int n_row = batch_f32.grid_y; + // split batch into chunks of single images + // NOTE: batch_f32 will be invalidated after this call + auto chunks = split_batch_to_chunk(std::move(batch_f32), bitmap->id); + GGML_ASSERT(chunks.size() > 0); + + auto ov_chunk = std::move(chunks.front()); + chunks.erase(chunks.begin()); + + // add overview image (first) + if (ctx->ov_img_first) { + add_text(ctx->tok_ov_img_start); + cur.entries.emplace_back(std::move(ov_chunk)); + add_text(ctx->tok_ov_img_end); } - - // handle llava-uhd style preprocessing - if ( - ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5 - || ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6 - || ctx->slice_tmpl == MTMD_SLICE_TMPL_LLAMA4 - || ctx->slice_tmpl == MTMD_SLICE_TMPL_IDEFICS3 - ) { - 
const int n_col = batch_f32.grid_x; - const int n_row = batch_f32.grid_y; - // split batch into chunks of single images - // NOTE: batch_f32 will be invalidated after this call - auto chunks = split_batch_to_chunk(std::move(batch_f32), bitmap->id); - GGML_ASSERT(chunks.size() > 0); - - auto ov_chunk = std::move(chunks.front()); - chunks.erase(chunks.begin()); - - // add overview image (first) - if (ctx->ov_img_first) { - add_text(ctx->tok_ov_img_start); - cur.entries.emplace_back(std::move(ov_chunk)); - add_text(ctx->tok_ov_img_end); - } - + + if(have_slices){ // add slices (or tiles) if (!chunks.empty()) { GGML_ASSERT((int)chunks.size() == n_row * n_col); @@ -585,52 +610,56 @@ struct mtmd_tokenizer { } add_text(ctx->tok_slices_end); } + } - // add overview image (last) - if (!ctx->ov_img_first) { - add_text(ctx->tok_ov_img_start); - cur.entries.emplace_back(std::move(ov_chunk)); - add_text(ctx->tok_ov_img_end); - } + // add overview image (last) + if (!ctx->ov_img_first) { + add_text(ctx->tok_ov_img_start); + cur.entries.emplace_back(std::move(ov_chunk)); + add_text(ctx->tok_ov_img_end); + } - } else { - size_t n_tokens = 0; - for (const auto & entry : batch_f32.entries) { - n_tokens += clip_n_output_tokens(ctx->ctx_v, entry.get()); - } + } else { + size_t n_tokens = 0; + for (const auto & entry : batch_f32.entries) { + n_tokens += clip_n_output_tokens(ctx->ctx_v, entry.get()); + } - mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens); - if (ctx->use_mrope) { - // for Qwen2VL, we need this information for M-RoPE decoding positions - image_tokens->nx = clip_n_output_tokens_x(ctx->ctx_v, batch_f32.entries[0].get()); - image_tokens->ny = clip_n_output_tokens_y(ctx->ctx_v, batch_f32.entries[0].get()); - image_tokens->use_mrope_pos = true; - } else { - // other models, we only need the total number of tokens - image_tokens->nx = n_tokens; - image_tokens->ny = 1; - } - image_tokens->batch_f32 = std::move(batch_f32); - image_tokens->id = bitmap->id; // optional + 
mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens); + if (ctx->use_mrope) { + // for Qwen2VL, we need this information for M-RoPE decoding positions + image_tokens->nx = clip_n_output_tokens_x(ctx->ctx_v, batch_f32.entries[0].get()); + image_tokens->ny = clip_n_output_tokens_y(ctx->ctx_v, batch_f32.entries[0].get()); + image_tokens->use_mrope_pos = true; + } else { + // other models, we only need the total number of tokens + image_tokens->nx = n_tokens; + image_tokens->ny = 1; + } + image_tokens->batch_f32 = std::move(batch_f32); + image_tokens->id = bitmap->id; // optional - LOG_DBG("image_tokens->nx = %d\n", image_tokens->nx); - LOG_DBG("image_tokens->ny = %d\n", image_tokens->ny); - LOG_DBG("batch_f32 size = %d\n", (int)image_tokens->batch_f32.entries.size()); + LOG_DBG("image_tokens->nx = %d\n", image_tokens->nx); + LOG_DBG("image_tokens->ny = %d\n", image_tokens->ny); + LOG_DBG("batch_f32 size = %d\n", (int)image_tokens->batch_f32.entries.size()); - mtmd_input_chunk chunk{ - MTMD_INPUT_CHUNK_TYPE_IMAGE, - {}, // text tokens - std::move(image_tokens), - nullptr, // audio tokens - }; - cur.entries.emplace_back(std::move(chunk)); - } + mtmd_input_chunk chunk{ + MTMD_INPUT_CHUNK_TYPE_IMAGE, + {}, // text tokens + std::move(image_tokens), + nullptr, // audio tokens + }; + cur.entries.emplace_back(std::move(chunk)); + } - if (!ctx->img_end.empty()) { - add_text(ctx->img_end, true); // add image end token - } + if (!ctx->img_end.empty()) { + add_text(ctx->img_end, true); // add image end token + } + return 0; + } - } else { + int32_t add_media(const mtmd_bitmap * bitmap) { + if(mtmd_bitmap_is_audio(bitmap)){ // handle audio if (!ctx->ctx_a) { @@ -690,9 +719,39 @@ struct mtmd_tokenizer { if (!ctx->aud_end.empty()) { add_text(ctx->aud_end, true); // add audio end token } + + return 0; } - return 0; + if(mtmd_bitmap_is_video(bitmap)){ + // handle video + if (!ctx->ctx_v) { + LOG_ERR("%s: error: model does not support vision input\n", __func__); + return 2; + } + 
+ if ( + ctx->video_sch == MTMD_VIDEO_SCHEMA_MINICPMV_4_5 + ) { + // TODO + LOG_ERR("%s: error: video schema MINICPMV_4_5 is not implemented yet\n", __func__); + return 2; + }else{ + const auto frame_size = bitmap->nx * bitmap->ny * 3; + for(uint32_t f = 0; f < bitmap->nz; f++){ + // extract frame f + mtmd_bitmap* frame_bitmap = mtmd_bitmap_init(bitmap->nx, bitmap->ny, bitmap->data.data() + f * frame_size); + // minicpmv 2.6 video understanding: disable UHD slicing (overview only) + int32_t res = add_image(frame_bitmap, false /* have_slices */); + mtmd_bitmap_free(frame_bitmap); + if (res != 0) return res; + } + } + return 0; + } + + // handle image otherwise + return add_image(bitmap); } std::vector split_batch_to_chunk(clip_image_f32_batch && batch_f32, const std::string & id) { @@ -862,6 +921,7 @@ int mtmd_get_audio_bitrate(mtmd_context * ctx) { return 16000; // 16kHz } + // // public API functions // @@ -874,6 +934,8 @@ mtmd_bitmap * mtmd_bitmap_init(uint32_t nx, mtmd_bitmap * bitmap = new mtmd_bitmap; bitmap->nx = nx; bitmap->ny = ny; + bitmap->nz = 1; + bitmap->type = MTMD_BITMAP_TYPE_IMAGE; size_t data_size = (size_t)nx * ny * 3; bitmap->data.resize(data_size); std::memcpy(bitmap->data.data(), data, data_size); @@ -885,13 +947,29 @@ mtmd_bitmap * mtmd_bitmap_init_from_audio(size_t n_samples, mtmd_bitmap * bitmap = new mtmd_bitmap; bitmap->nx = n_samples; bitmap->ny = 1; - bitmap->is_audio = true; + bitmap->nz = 1; + bitmap->type = MTMD_BITMAP_TYPE_AUDIO; size_t data_size = n_samples * sizeof(float); bitmap->data.resize(data_size); std::memcpy(bitmap->data.data(), data, data_size); return bitmap; } +mtmd_bitmap * mtmd_bitmap_init_from_video(uint32_t nx, uint32_t ny, uint32_t nframes, const unsigned char * data){ + mtmd_bitmap * bitmap = new mtmd_bitmap; + bitmap->nx = nx; + bitmap->ny = ny; + bitmap->nz = nframes; + bitmap->type = MTMD_BITMAP_TYPE_VIDEO; + size_t data_size = (size_t)nx * ny * nframes * 3; + bitmap->data.resize(data_size); + if(data != 
nullptr){ + std::memcpy(bitmap->data.data(), data, data_size); + } + return bitmap; +} + + uint32_t mtmd_bitmap_get_nx(const mtmd_bitmap * bitmap) { return bitmap->nx; } @@ -909,7 +987,11 @@ size_t mtmd_bitmap_get_n_bytes(const mtmd_bitmap * bitmap) { } bool mtmd_bitmap_is_audio(const mtmd_bitmap * bitmap) { - return bitmap->is_audio; + return bitmap->type == MTMD_BITMAP_TYPE_AUDIO; +} + +bool mtmd_bitmap_is_video(const mtmd_bitmap * bitmap) { + return bitmap->type == MTMD_BITMAP_TYPE_VIDEO; } const char * mtmd_bitmap_get_id(const mtmd_bitmap * bitmap) { @@ -930,6 +1012,20 @@ void mtmd_bitmap_free(mtmd_bitmap * bitmap) { } } +bool mtmd_bitmap_set_frame(mtmd_bitmap * bitmap, size_t i, const unsigned char * data) { + if (bitmap->type != MTMD_BITMAP_TYPE_VIDEO) { + LOG_ERR("%s: error: bitmap is not a video\n", __func__); + return false; + } + if (i >= (size_t)bitmap->nz) { + LOG_ERR("%s: error: frame index %zu out of range (nz = %u)\n", __func__, i, bitmap->nz); + return false; + } + size_t frame_size = (size_t)bitmap->nx * bitmap->ny * 3; + std::memcpy(bitmap->data.data() + i * frame_size, data, frame_size); + return true; +} + // mtmd_input_chunks mtmd_input_chunks * mtmd_input_chunks_init() { diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h index 775fba6215c7c..de5e32d06c780 100644 --- a/tools/mtmd/mtmd.h +++ b/tools/mtmd/mtmd.h @@ -125,14 +125,23 @@ MTMD_API int mtmd_get_audio_bitrate(mtmd_context * ctx); // if bitmap is audio: // length of data must be n_samples * sizeof(float) // the data is in float format (PCM F32) +// if bitmap is video: +// length of data must be nx * ny * nframes * 3 +// the data is in RGBRGBRGB... 
format for each frame, frames are stored sequentially +// if data is nullptr, an empty bitmap is created (you can init content later) MTMD_API mtmd_bitmap * mtmd_bitmap_init (uint32_t nx, uint32_t ny, const unsigned char * data); MTMD_API mtmd_bitmap * mtmd_bitmap_init_from_audio(size_t n_samples, const float * data); +MTMD_API mtmd_bitmap * mtmd_bitmap_init_from_video(uint32_t nx, uint32_t ny, uint32_t nframes, const unsigned char * data); MTMD_API uint32_t mtmd_bitmap_get_nx (const mtmd_bitmap * bitmap); MTMD_API uint32_t mtmd_bitmap_get_ny (const mtmd_bitmap * bitmap); MTMD_API const unsigned char * mtmd_bitmap_get_data (const mtmd_bitmap * bitmap); MTMD_API size_t mtmd_bitmap_get_n_bytes(const mtmd_bitmap * bitmap); MTMD_API bool mtmd_bitmap_is_audio (const mtmd_bitmap * bitmap); +MTMD_API bool mtmd_bitmap_is_video (const mtmd_bitmap * bitmap); MTMD_API void mtmd_bitmap_free (mtmd_bitmap * bitmap); +// set single frame i of the video bitmap +// return false if bitmap is not video or i is out of range +MTMD_API bool mtmd_bitmap_set_frame (mtmd_bitmap * bitmap, size_t i, const unsigned char * data); // bitmap ID is optional, but useful for KV cache tracking // these getters/setters are dedicated functions, so you can for example calculate the hash of the image based on mtmd_bitmap_get_data() MTMD_API const char * mtmd_bitmap_get_id(const mtmd_bitmap * bitmap); diff --git a/tools/server/server.cpp b/tools/server/server.cpp index a8d7773c96809..ccf633a299df7 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -3983,7 +3983,6 @@ struct server_context { } slot.n_prompt_tokens_processed += n_tokens_out; - // add the image chunk to cache { const auto & chunk = input_tokens.find_chunk(slot.prompt.n_tokens());