1414#include < limits>
1515#include < vector>
1616
// Kind of media payload carried by an mtmd_bitmap.
enum class mtmd_bitmap_type {
    IMAGE,
    AUDIO,
    VIDEO,
};

// Raw media buffer. The required length and layout of `data` depend on `type`:
//   IMAGE: nx * ny * 3 bytes, interleaved RGBRGBRGB...
//   AUDIO: n_samples * sizeof(float) bytes, float samples (PCM F32)
//   VIDEO: nx * ny * nframes * 3 bytes, RGBRGBRGB... for each frame,
//          frames are stored sequentially
struct mtmd_bitmap {
    // Every scalar member carries a default so that a default-constructed
    // bitmap never exposes indeterminate values; the init functions overwrite
    // these with the real dimensions and type.
    mtmd_bitmap_type type = mtmd_bitmap_type::IMAGE;
    uint32_t nx = 0;
    uint32_t ny = 0; // for audio, ny=1
    uint32_t nz = 1; // for video: number of frames
    std::vector<unsigned char> data;
    std::string id; // optional user-defined id, for ex: can be set to image hash, useful for KV cache tracking
};
2940
3041struct mtmd_image_tokens {
@@ -901,8 +912,7 @@ mtmd_bitmap * mtmd_bitmap_init(uint32_t nx,
901912 bitmap->nx = nx;
902913 bitmap->ny = ny;
903914 bitmap->nz = 1 ;
904- bitmap->is_audio = false ;
905- bitmap->is_video = false ;
915+ bitmap->type = mtmd_bitmap_type::IMAGE;
906916 size_t data_size = (size_t )nx * ny * 3 ;
907917 bitmap->data .resize (data_size);
908918 std::memcpy (bitmap->data .data (), data, data_size);
@@ -915,8 +925,7 @@ mtmd_bitmap * mtmd_bitmap_init_from_audio(size_t n_samples,
915925 bitmap->nx = n_samples;
916926 bitmap->ny = 1 ;
917927 bitmap->nz = 1 ;
918- bitmap->is_audio = true ;
919- bitmap->is_video = false ;
928+ bitmap->type = mtmd_bitmap_type::AUDIO;
920929 size_t data_size = n_samples * sizeof (float );
921930 bitmap->data .resize (data_size);
922931 std::memcpy (bitmap->data .data (), data, data_size);
@@ -928,8 +937,7 @@ mtmd_bitmap * mtmd_bitmap_init_from_video(uint32_t nx, uint32_t ny, uint32_t nfr
928937 bitmap->nx = nx;
929938 bitmap->ny = ny;
930939 bitmap->nz = nframes;
931- bitmap->is_audio = false ;
932- bitmap->is_video = true ;
940+ bitmap->type = mtmd_bitmap_type::VIDEO;
933941 size_t data_size = (size_t )nx * ny * nframes * 3 ;
934942 bitmap->data .resize (data_size);
935943 if (data != nullptr ){
@@ -960,11 +968,11 @@ size_t mtmd_bitmap_get_n_bytes(const mtmd_bitmap * bitmap) {
960968}
961969
962970bool mtmd_bitmap_is_audio (const mtmd_bitmap * bitmap) {
963- return bitmap->is_audio ;
971+ return bitmap->type == mtmd_bitmap_type::AUDIO ;
964972}
965973
966974bool mtmd_bitmap_is_video (const mtmd_bitmap * bitmap) {
967- return bitmap->is_video ;
975+ return bitmap->type == mtmd_bitmap_type::VIDEO ;
968976}
969977
970978const char * mtmd_bitmap_get_id (const mtmd_bitmap * bitmap) {
0 commit comments