Commit efefc2a

feat&fix: fix segmentation fault bug, add llama-server video_url api
1 parent: 816b92b

4 files changed (+73, -18 lines)

tools/mtmd/mtmd-video.cpp

Lines changed: 18 additions & 9 deletions

```diff
@@ -160,21 +160,31 @@ bool is_video_buffer(const uint8_t *data, size_t size){
     if (!data || size < 16) return false; // too short
 
     AVProbeData probe;
-    probe.buf = const_cast<uint8_t*>(data);
-    probe.buf_size = (int)size;
     probe.filename = "";
 
     // ffmpeg requires that the last AVPROBE_PADDING_SIZE bytes of the buffer must be 0
-    std::vector<uint8_t> padded(size + AVPROBE_PADDING_SIZE);
-    memcpy(padded.data(), data, size);
-    memset(padded.data() + size, 0, AVPROBE_PADDING_SIZE);
-    probe.buf = padded.data();
+    auto * padded = new uint8_t[size + AVPROBE_PADDING_SIZE];
+    memcpy(padded, data, size);
+    memset(padded + size, 0, AVPROBE_PADDING_SIZE);
+    probe.buf = padded;
     probe.buf_size = (int)size;
 
-    const AVInputFormat *fmt = av_probe_input_format(&probe, 1);
-    if (!fmt) return false;
+    int score = 0;
+    const AVInputFormat *fmt = av_probe_input_format2(&probe, 1, &score);
+
+    delete [] padded;
+
+    if (!fmt || score < 25) return false;
     if (fmt->flags & AVFMT_NOFILE) return false;
 
+    static const char *image_formats[] = {
+        "jpeg_pipe", "png_pipe", "bmp_pipe", "gif_pipe", "webp_pipe",
+        "tiff_pipe", "image2", "image2pipe", "mjpeg"
+    };
+    for (auto name : image_formats)
+        if (fmt->name && strstr(fmt->name, name))
+            return false;
+
     return true;
 }
 
@@ -592,7 +602,6 @@ mtmd_bitmap* init_video_bitmap(mtmd_context * ctx, const uint8_t* buffer, size_t
         LOG_ERR("Unable to get video info from buffer\n");
         return nullptr;
     }
-    printf("get info\n");
 
     const auto opts = get_video_sample_options(info);
 
```

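The fix above switches from `av_probe_input_format()` to `av_probe_input_format2()`, which also reports a detection confidence score; the detected format is only trusted if the score reaches 25, and formats that FFmpeg recognizes but that are really still images (`png_pipe`, `image2`, ...) are rejected explicitly. For intuition, here is a rough external analogue using the `ffprobe` CLI (a hypothetical helper, not part of the commit; it assumes FFmpeg is installed, and the file name is a placeholder):

```python
# Rough external analogue of is_video_buffer(): feed raw bytes to ffprobe
# on stdin and read back the detected container format name.
import json
import subprocess

def probe_format(data: bytes):
    proc = subprocess.run(
        ["ffprobe", "-v", "error", "-show_entries", "format=format_name",
         "-of", "json", "-i", "pipe:0"],
        input=data, capture_output=True,
    )
    if proc.returncode != 0:
        return None  # not recognized as any format
    return json.loads(proc.stdout)["format"]["format_name"]

with open("clip.mp4", "rb") as f:  # placeholder file
    print(probe_format(f.read()))  # e.g. "mov,mp4,m4a,3gp,3g2,mj2"; a PNG yields "png_pipe"
```
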
tools/server/README.md

Lines changed: 2 additions & 2 deletions

```diff
@@ -430,7 +430,7 @@ Multiple prompts are also supported. In this case, the completion result will be
 - Strings, JSON objects, and sequences of tokens: `["string1", [12, 34, 56], { "prompt_string": "string", "multimodal_data": ["base64"]}]`
 - Mixed types: `[[12, 34, "string", 56, 78], [12, 34, 56], "string", { "prompt_string": "string" }]`
 
-Note for `multimodal_data` in JSON object prompts. This should be an array of strings, containing base64 encoded multimodal data such as images and audio. There must be an identical number of MTMD media markers in the string prompt element which act as placeholders for the data provided to this parameter. The multimodal data files will be substituted in order. The marker string (e.g. `<__media__>`) can be found by calling `mtmd_default_marker()` defined in [the MTMD C API](https://github.com/ggml-org/llama.cpp/blob/5fd160bbd9d70b94b5b11b0001fd7f477005e4a0/tools/mtmd/mtmd.h#L87). A client *must not* specify this field unless the server has the multimodal capability. Clients should check `/models` or `/v1/models` for the `multimodal` capability before a multimodal request.
+Note for `multimodal_data` in JSON object prompts. This should be an array of strings, containing base64 encoded multimodal data such as images, audio and video. There must be an identical number of MTMD media markers in the string prompt element which act as placeholders for the data provided to this parameter. The multimodal data files will be substituted in order. The marker string (e.g. `<__media__>`) can be found by calling `mtmd_default_marker()` defined in [the MTMD C API](https://github.com/ggml-org/llama.cpp/blob/5fd160bbd9d70b94b5b11b0001fd7f477005e4a0/tools/mtmd/mtmd.h#L87). A client *must not* specify this field unless the server has the multimodal capability. Clients should check `/models` or `/v1/models` for the `multimodal` capability before a multimodal request.
 
 `temperature`: Adjust the randomness of the generated text. Default: `0.8`
 
```
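
As an illustration of the marker/data pairing described above, a minimal request sketch, assuming a multimodal-capable `llama-server` at the default address (host, port, and file name are placeholders, not part of the commit):

```python
# Minimal /completion sketch: one <__media__> marker per multimodal_data entry.
import base64
import json
import urllib.request

with open("clip.mp4", "rb") as f:  # placeholder media file
    media_b64 = base64.b64encode(f.read()).decode()

payload = {
    "prompt": {
        "prompt_string": "Describe this video: <__media__>",
        "multimodal_data": [media_b64],  # substituted for the marker, in order
    },
    "n_predict": 64,
}
req = urllib.request.Request(
    "http://localhost:8080/completion",
    data=json.dumps(payload).encode(),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    print(json.load(resp)["content"])
```
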
```diff
@@ -1210,7 +1210,7 @@ print(completion.choices[0].text)
 
 Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only models with a [supported chat template](https://github.com/ggml-org/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, the ChatML template will be used.
 
-If model supports multimodal, you can input the media file via `image_url` content part. We support both base64 and remote URL as input. See OAI documentation for more.
+If model supports multimodal, you can input the media file via `image_url` or `video_url` content part. We support both base64 and remote URL as input. See OAI documentation for more.
 
 *Options:*
 
```
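
The new `video_url` content part mirrors the existing `image_url` one. A sketch of a client using it through the OpenAI Python client; the server address, model name, and file name are illustrative placeholders:

```python
# Sketch: send a base64 video to the OpenAI-compatible chat endpoint.
import base64
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8080/v1", api_key="sk-no-key-required")

with open("clip.mp4", "rb") as f:  # placeholder video file
    video_b64 = base64.b64encode(f.read()).decode()

resp = client.chat.completions.create(
    model="local",  # llama-server serves whatever model it was started with
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "What happens in this video?"},
            # a remote "http(s)://..." URL is also accepted and downloaded server-side
            {"type": "video_url", "video_url": {"url": f"data:video/mp4;base64,{video_b64}"}},
        ],
    }],
)
print(resp.choices[0].message.content)
```
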
tools/server/server.cpp

Lines changed: 5 additions & 5 deletions

```diff
@@ -3638,21 +3638,21 @@ struct server_context {
     // remove the non-common part from the cache
     slot.cache_tokens.keep_first(slot.n_past);
 
-    // check if we should process the image
+    // check if we should process the media chunk (image, audio, video, ...)
     if (slot.n_past < slot.n_prompt_tokens && slot.prompt_tokens[slot.n_past] == LLAMA_TOKEN_NULL) {
-        // process the image
+        // process the media chunk
         int32_t new_n_past;
         int32_t res = slot.prompt_tokens.process_chunk(ctx, mctx, slot.n_past, slot.id, new_n_past);
         int32_t n_pos = new_n_past - slot.n_past;
 
         if (res != 0) {
-            SLT_ERR(slot, "failed to process image, res = %d\n", res);
+            SLT_ERR(slot, "failed to process media, res = %d\n", res);
             slot.release();
-            send_error(slot, "failed to process image", ERROR_TYPE_SERVER);
+            send_error(slot, "failed to process media", ERROR_TYPE_SERVER);
             continue;
         }
 
-        // add the image chunk to cache
+        // add the media chunk to cache
         {
             const auto & chunk = slot.prompt_tokens.find_chunk(slot.n_past);
             slot.cache_tokens.push_back(chunk.get()); // copy
```

tools/server/utils.hpp

Lines changed: 48 additions & 2 deletions

```diff
@@ -677,7 +677,53 @@ static json oaicompat_chat_params_parse(
                 p["text"] = mtmd_default_marker();
                 p.erase("input_audio");
 
-            } else if (type != "text") {
+            } else if (type == "video_url") {
+                if (!opt.allow_image) { // TODO: separate video flag?
+                    throw std::runtime_error("video input is not supported - hint: if this is unexpected, you may need to provide the mmproj");
+                }
+
+                json video_url = json_value(p, "video_url", json::object());
+                std::string url = json_value(video_url, "url", std::string());
+                if (string_starts_with(url, "http")) {
+                    // download remote video
+                    // TODO @ngxson : maybe make these params configurable
+                    common_remote_params params;
+                    params.headers.push_back("User-Agent: llama.cpp/" + build_info);
+                    params.max_size = 1024 * 1024 * 100; // 100MB
+                    params.timeout = 100; // seconds
+                    SRV_INF("downloading video from '%s'\n", url.c_str());
+                    auto res = common_remote_get_content(url, params);
+                    if (200 <= res.first && res.first < 300) {
+                        SRV_INF("downloaded %ld bytes\n", res.second.size());
+                        raw_buffer data;
+                        data.insert(data.end(), res.second.begin(), res.second.end());
+                        out_files.push_back(data);
+                    } else {
+                        throw std::runtime_error("Failed to download video");
+                    }
+
+                } else {
+                    // try to decode base64 video
+                    std::vector<std::string> parts = string_split<std::string>(url, /*separator*/ ',');
+                    if (parts.size() != 2) {
+                        throw std::runtime_error("Invalid video_url.url value");
+                    } else if (!string_starts_with(parts[0], "data:video/")) {
+                        throw std::runtime_error("Invalid video_url.url format: " + parts[0]);
+                    } else if (!string_ends_with(parts[0], "base64")) {
+                        throw std::runtime_error("video_url.url must be base64 encoded");
+                    } else {
+                        auto base64_data = parts[1];
+                        auto decoded_data = base64_decode(base64_data);
+                        out_files.push_back(decoded_data);
+                    }
+                }
+
+                // replace this chunk with a marker
+                p["type"] = "text";
+                p["text"] = mtmd_default_marker();
+                p.erase("video_url");
+
+            } else if (type != "text") {
                 throw std::runtime_error("unsupported content[].type");
             }
         }
```
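
From the branches above, `video_url.url` is accepted either as an `http(s)` URL (downloaded server-side, capped at 100MB) or as a base64 data URL of the form `data:video/<subtype>;base64,<payload>`. A small Python mirror of that validation (hypothetical helper, not part of the commit):

```python
# Hypothetical client-side mirror of the server's video_url.url checks.
def is_acceptable_video_url(url: str) -> bool:
    if url.startswith("http"):  # remote URL: the server downloads it
        return True
    header, sep, _payload = url.partition(",")  # "data:video/mp4;base64,<data>"
    return (
        sep == ","
        and header.startswith("data:video/")
        and header.endswith("base64")
    )

assert is_acceptable_video_url("https://example.com/clip.mp4")
assert is_acceptable_video_url("data:video/mp4;base64,AAAA")
assert not is_acceptable_video_url("data:image/png;base64,AAAA")  # wrong MIME class
```
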
```diff
@@ -1401,7 +1447,7 @@ static server_tokens process_mtmd_prompt(mtmd_context * mctx, std::string prompt
     for (auto & file : files) {
         mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(mctx, file.data(), file.size()));
         if (!bmp.ptr) {
-            throw std::runtime_error("Failed to load image or audio file");
+            throw std::runtime_error("Failed to load media file");
         }
         // calculate bitmap hash (for KV caching)
         std::string hash = fnv_hash(bmp.data(), bmp.n_bytes());
```
