Commit efefc2a

feat&fix: fix segmentation fault bug, add llama-server video_url api
1 parent: 816b92b

4 files changed (+73, -18 lines)

tools/mtmd/mtmd-video.cpp

Lines changed: 18 additions & 9 deletions

```diff
@@ -160,21 +160,31 @@ bool is_video_buffer(const uint8_t *data, size_t size){
     if (!data || size < 16) return false; // too short
 
     AVProbeData probe;
-    probe.buf = const_cast<uint8_t*>(data);
-    probe.buf_size = (int)size;
     probe.filename = "";
 
     // ffmpeg requires that the last AVPROBE_PADDING_SIZE bytes of the buffer must be 0
-    std::vector<uint8_t> padded(size + AVPROBE_PADDING_SIZE);
-    memcpy(padded.data(), data, size);
-    memset(padded.data() + size, 0, AVPROBE_PADDING_SIZE);
-    probe.buf = padded.data();
+    auto * padded = new uint8_t[size + AVPROBE_PADDING_SIZE];
+    memcpy(padded, data, size);
+    memset(padded + size, 0, AVPROBE_PADDING_SIZE);
+    probe.buf = padded;
     probe.buf_size = (int)size;
 
-    const AVInputFormat *fmt = av_probe_input_format(&probe, 1);
-    if (!fmt) return false;
+    int score = 0;
+    const AVInputFormat *fmt = av_probe_input_format2(&probe, 1, &score);
+
+    delete [] padded;
+
+    if (!fmt || score < 25) return false;
     if (fmt->flags & AVFMT_NOFILE) return false;
 
+    static const char *image_formats[] = {
+        "jpeg_pipe", "png_pipe", "bmp_pipe", "gif_pipe", "webp_pipe",
+        "tiff_pipe", "image2", "image2pipe", "mjpeg"
+    };
+    for (auto name : image_formats)
+        if (fmt->name && strstr(fmt->name, name))
+            return false;
+
     return true;
 }
 
@@ -592,7 +602,6 @@ mtmd_bitmap* init_video_bitmap(mtmd_context * ctx, const uint8_t* buffer, size_t
         LOG_ERR("Unable to get video info from buffer\n");
         return nullptr;
     }
-    printf("get info\n");
 
     const auto opts = get_video_sample_options(info);
 
```

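The fix above switches from `av_probe_input_format()` to `av_probe_input_format2()`, which also reports a detection confidence score; the detected format is only trusted if the score reaches 25, and formats that FFmpeg recognizes but that are really still images (`png_pipe`, `image2`, ...) are rejected explicitly. For intuition, here is a rough external analogue using the `ffprobe` CLI (a hypothetical helper, not part of the commit; it assumes FFmpeg is installed, and the file name is a placeholder):

```python
# Rough external analogue of is_video_buffer(): feed raw bytes to ffprobe
# on stdin and read back the detected container format name.
import json
import subprocess

def probe_format(data: bytes):
    proc = subprocess.run(
        ["ffprobe", "-v", "error", "-show_entries", "format=format_name",
         "-of", "json", "-i", "pipe:0"],
        input=data, capture_output=True,
    )
    if proc.returncode != 0:
        return None  # not recognized as any format
    return json.loads(proc.stdout)["format"]["format_name"]

with open("clip.mp4", "rb") as f:  # placeholder file
    print(probe_format(f.read()))  # e.g. "mov,mp4,m4a,3gp,3g2,mj2"; a PNG yields "png_pipe"
```
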
tools/server/README.md

Lines changed: 2 additions & 2 deletions

```diff
@@ -430,7 +430,7 @@ Multiple prompts are also supported. In this case, the completion result will be
 - Strings, JSON objects, and sequences of tokens: `["string1", [12, 34, 56], { "prompt_string": "string", "multimodal_data": ["base64"]}]`
 - Mixed types: `[[12, 34, "string", 56, 78], [12, 34, 56], "string", { "prompt_string": "string" }]`
 
-Note for `multimodal_data` in JSON object prompts. This should be an array of strings, containing base64 encoded multimodal data such as images and audio. There must be an identical number of MTMD media markers in the string prompt element which act as placeholders for the data provided to this parameter. The multimodal data files will be substituted in order. The marker string (e.g. `<__media__>`) can be found by calling `mtmd_default_marker()` defined in [the MTMD C API](https://github.com/ggml-org/llama.cpp/blob/5fd160bbd9d70b94b5b11b0001fd7f477005e4a0/tools/mtmd/mtmd.h#L87). A client *must not* specify this field unless the server has the multimodal capability. Clients should check `/models` or `/v1/models` for the `multimodal` capability before a multimodal request.
+Note for `multimodal_data` in JSON object prompts. This should be an array of strings, containing base64 encoded multimodal data such as images, audio and video. There must be an identical number of MTMD media markers in the string prompt element which act as placeholders for the data provided to this parameter. The multimodal data files will be substituted in order. The marker string (e.g. `<__media__>`) can be found by calling `mtmd_default_marker()` defined in [the MTMD C API](https://github.com/ggml-org/llama.cpp/blob/5fd160bbd9d70b94b5b11b0001fd7f477005e4a0/tools/mtmd/mtmd.h#L87). A client *must not* specify this field unless the server has the multimodal capability. Clients should check `/models` or `/v1/models` for the `multimodal` capability before a multimodal request.
 
 `temperature`: Adjust the randomness of the generated text. Default: `0.8`
 
```
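
As an illustration of the marker/data pairing described above, a minimal request sketch, assuming a multimodal-capable `llama-server` at the default address (host, port, and file name are placeholders, not part of the commit):

```python
# Minimal /completion sketch: one <__media__> marker per multimodal_data entry.
import base64
import json
import urllib.request

with open("clip.mp4", "rb") as f:  # placeholder media file
    media_b64 = base64.b64encode(f.read()).decode()

payload = {
    "prompt": {
        "prompt_string": "Describe this video: <__media__>",
        "multimodal_data": [media_b64],  # substituted for the marker, in order
    },
    "n_predict": 64,
}
req = urllib.request.Request(
    "http://localhost:8080/completion",
    data=json.dumps(payload).encode(),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    print(json.load(resp)["content"])
```
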
```diff
@@ -1210,7 +1210,7 @@ print(completion.choices[0].text)
 
 Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only models with a [supported chat template](https://github.com/ggml-org/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, the ChatML template will be used.
 
-If model supports multimodal, you can input the media file via `image_url` content part. We support both base64 and remote URL as input. See OAI documentation for more.
+If model supports multimodal, you can input the media file via `image_url` or `video_url` content part. We support both base64 and remote URL as input. See OAI documentation for more.
 
 *Options:*
 
```
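
The new `video_url` content part mirrors the existing `image_url` one. A sketch of a client using it through the OpenAI Python client; the server address, model name, and file name are illustrative placeholders:

```python
# Sketch: send a base64 video to the OpenAI-compatible chat endpoint.
import base64
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8080/v1", api_key="sk-no-key-required")

with open("clip.mp4", "rb") as f:  # placeholder video file
    video_b64 = base64.b64encode(f.read()).decode()

resp = client.chat.completions.create(
    model="local",  # llama-server serves whatever model it was started with
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "What happens in this video?"},
            # a remote "http(s)://..." URL is also accepted and downloaded server-side
            {"type": "video_url", "video_url": {"url": f"data:video/mp4;base64,{video_b64}"}},
        ],
    }],
)
print(resp.choices[0].message.content)
```
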
tools/server/server.cpp

Lines changed: 5 additions & 5 deletions

```diff
@@ -3638,21 +3638,21 @@ struct server_context {
     // remove the non-common part from the cache
     slot.cache_tokens.keep_first(slot.n_past);
 
-    // check if we should process the image
+    // check if we should process the media chunk (image, audio, video, ...)
     if (slot.n_past < slot.n_prompt_tokens && slot.prompt_tokens[slot.n_past] == LLAMA_TOKEN_NULL) {
-        // process the image
+        // process the media chunk
         int32_t new_n_past;
         int32_t res = slot.prompt_tokens.process_chunk(ctx, mctx, slot.n_past, slot.id, new_n_past);
         int32_t n_pos = new_n_past - slot.n_past;
 
         if (res != 0) {
-            SLT_ERR(slot, "failed to process image, res = %d\n", res);
+            SLT_ERR(slot, "failed to process media, res = %d\n", res);
             slot.release();
-            send_error(slot, "failed to process image", ERROR_TYPE_SERVER);
+            send_error(slot, "failed to process media", ERROR_TYPE_SERVER);
             continue;
         }
 
-        // add the image chunk to cache
+        // add the media chunk to cache
         {
             const auto & chunk = slot.prompt_tokens.find_chunk(slot.n_past);
             slot.cache_tokens.push_back(chunk.get()); // copy
```

tools/server/utils.hpp

Lines changed: 48 additions & 2 deletions

```diff
@@ -677,7 +677,53 @@ static json oaicompat_chat_params_parse(
                 p["text"] = mtmd_default_marker();
                 p.erase("input_audio");
 
-            } else if (type != "text") {
+            } else if (type == "video_url") {
+                if (!opt.allow_image) { // TODO: separate video flag?
+                    throw std::runtime_error("video input is not supported - hint: if this is unexpected, you may need to provide the mmproj");
+                }
+
+                json video_url = json_value(p, "video_url", json::object());
+                std::string url = json_value(video_url, "url", std::string());
+                if (string_starts_with(url, "http")) {
+                    // download remote video
+                    // TODO @ngxson : maybe make these params configurable
+                    common_remote_params params;
+                    params.headers.push_back("User-Agent: llama.cpp/" + build_info);
+                    params.max_size = 1024 * 1024 * 100; // 100MB
+                    params.timeout = 100; // seconds
+                    SRV_INF("downloading video from '%s'\n", url.c_str());
+                    auto res = common_remote_get_content(url, params);
+                    if (200 <= res.first && res.first < 300) {
+                        SRV_INF("downloaded %ld bytes\n", res.second.size());
+                        raw_buffer data;
+                        data.insert(data.end(), res.second.begin(), res.second.end());
+                        out_files.push_back(data);
+                    } else {
+                        throw std::runtime_error("Failed to download video");
+                    }
+
+                } else {
+                    // try to decode base64 video
+                    std::vector<std::string> parts = string_split<std::string>(url, /*separator*/ ',');
+                    if (parts.size() != 2) {
+                        throw std::runtime_error("Invalid video_url.url value");
+                    } else if (!string_starts_with(parts[0], "data:video/")) {
+                        throw std::runtime_error("Invalid video_url.url format: " + parts[0]);
+                    } else if (!string_ends_with(parts[0], "base64")) {
+                        throw std::runtime_error("video_url.url must be base64 encoded");
+                    } else {
+                        auto base64_data = parts[1];
+                        auto decoded_data = base64_decode(base64_data);
+                        out_files.push_back(decoded_data);
+                    }
+                }
+
+                // replace this chunk with a marker
+                p["type"] = "text";
+                p["text"] = mtmd_default_marker();
+                p.erase("video_url");
+
+            } else if (type != "text") {
                 throw std::runtime_error("unsupported content[].type");
             }
         }
```
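
From the branches above, `video_url.url` is accepted either as an `http(s)` URL (downloaded server-side, capped at 100MB) or as a base64 data URL of the form `data:video/<subtype>;base64,<payload>`. A small Python mirror of that validation (hypothetical helper, not part of the commit):

```python
# Hypothetical client-side mirror of the server's video_url.url checks.
def is_acceptable_video_url(url: str) -> bool:
    if url.startswith("http"):  # remote URL: the server downloads it
        return True
    header, sep, _payload = url.partition(",")  # "data:video/mp4;base64,<data>"
    return (
        sep == ","
        and header.startswith("data:video/")
        and header.endswith("base64")
    )

assert is_acceptable_video_url("https://example.com/clip.mp4")
assert is_acceptable_video_url("data:video/mp4;base64,AAAA")
assert not is_acceptable_video_url("data:image/png;base64,AAAA")  # wrong MIME class
```
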
```diff
@@ -1401,7 +1447,7 @@ static server_tokens process_mtmd_prompt(mtmd_context * mctx, std::string prompt
     for (auto & file : files) {
         mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(mctx, file.data(), file.size()));
         if (!bmp.ptr) {
-            throw std::runtime_error("Failed to load image or audio file");
+            throw std::runtime_error("Failed to load media file");
         }
         // calculate bitmap hash (for KV caching)
         std::string hash = fnv_hash(bmp.data(), bmp.n_bytes());
```
