Better Qwen3VL chat template.

alcoftTAO · alcoftTAO · commit 1d41825851d3 · 2025-11-05T19:26:38.000+01:00
diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py
@@ -2800,6 +2800,7 @@ def __init__(self, clip_model_path: str, verbose: bool = True):
         self._mtmd_cpp = mtmd_cpp
         self._exit_stack = ExitStack()
         self.mtmd_ctx: Optional[mtmd_cpp.mtmd_context_p] = None
+        self.extra_template_arguments: dict[str, Any] = {}
 
         if not os.path.exists(clip_model_path):
             raise ValueError(f"Clip model path does not exist: {clip_model_path}")
@@ -2931,9 +2932,11 @@ def __call__(
         # Replace image URLs with media markers in the template
         text = template.render(
             messages=messages,
+            tools=tools,
             add_generation_prompt=True,
             eos_token=llama.detokenize([llama.token_eos()]),
             bos_token=llama.detokenize([llama.token_bos()]),
+            **self.extra_template_arguments
         )
 
         # Replace image URLs in text with media markers
@@ -3696,61 +3699,101 @@ def __call__(self, **kwargs):
 
 
 class Qwen3VLChatHandler(Llava15ChatHandler):
-    DEFAULT_SYSTEM_MESSAGE = "You are a helpful assistant."
-
-    CHAT_FORMAT_BASE = (
-        "{% set image_count = namespace(value=0) %}"
-        "{% for message in messages %}"
-        "{% if loop.first and message['role'] != 'system' %}"
-        "<|im_start|>system\n"
-        "{{ self.DEFAULT_SYSTEM_MESSAGE }}<|im_end|>\n"
-        "{% endif %}"
-        "<|im_start|>{{ message['role'] }}\n"
-        "{% if message['content'] is string %}"
-        "{{ message['content'] }}<|im_end|>\n"
-        "{% else %}"
-        "{% for content in message['content'] %}"
-        "{% if content['type'] == 'image_url' %}"
-        "{% if content.image_url is string %}"
-        "{% set image_count.value = image_count.value + 1 %}"
-        "Picture {{ image_count.value }}: <|vision_start|> {{ content.image_url }} <|vision_end|>"
-        "{% else %}"
-        "{% set image_count.value = image_count.value + 1 %}"
-        "Picture {{ image_count.value }}: <|vision_start|> {{ content.image_url.url }} <|vision_end|>"
-        "{% endif %}"
-        "{% elif content['type'] == 'text' %}"
-        "{{ content['text'] }}"
-        "{% endif %}"
-        "{% endfor %}"
-        "<|im_end|>\n"
-        "{% endif %}"
-        "{% endfor %}"
+    CHAT_FORMAT = (
+        "{{- '<|im_start|>system\n' -}}"
+        "{%- if messages[0].content is string and messages[0].role == 'system' -%}"
+            "{{- messages[0].content -}}"
+        "{%- elif messages[0].role == 'system' -%}"
+            "{%- if 'text' in messages[0].content -%}"
+                "{{- messages[0].content.text -}}"
+            "{%- else -%}"
+                "{{- 'You are a helpful assistant.' -}}"
+            "{%- endif -%}"
+        "{%- endif -%}"
+        "{%- if tools -%}"
+            "{{- '\n\n' -}}"
+            "{{- '# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>' -}}"
+            "{%- for tool in tools -%}"
+                "{{- '\n' -}}"
+                "{{- tool | tojson -}}"
+            "{%- endfor -%}"
+            "{{- '\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <arguments-json-object>}\n</tool_call>\n\nYou can also return a response for the user alongside a function call:\n<response-for-user>\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <arguments-json-object>}\n</tool_call>' -}}"
+        "{%- endif -%}"
+        #"{%- if thinking_budget -%}"
+        #    "{{- '\n\n# Reasoning\n\nYou must generate your reasoning steps within <think></think> XML tags:\n<think>\n<reasoning-content>\n</think>\n<final-response>\n\nThe reasoning content must not exceed the ' + thinking_budget + ' tokens budget.' -}}"
+        #"{%- endif -%}"  # Doesn't work very well, disabled for now
+        "{{- '<|im_end|>\n' -}}"
+        "{%- set image_count = namespace(value=0) -%}"
+        #"{%- set video_count = namespace(value=0) -%}"
+        "{%- for message in messages -%}"
+            "{%- if message.role == 'tool' -%}"
+                "{{- '<|im_start|>user\n<tool_response>\n' -}}"
+            "{%- elif message.role != 'system' -%}"
+                "{{- '<|im_start|>' + message.role + '\n' -}}"
+            "{%- endif -%}"
+            "{%- if message.content is string and message.role != 'system' -%}"
+                "{{- message.content -}}"
+            "{%- elif message.role != 'system' -%}"
+                "{%- for content in message.content -%}"
+                    "{%- if 'image_url' in content -%}"
+                        "{%- set image_count.value = image_count.value + 1 -%}"
+                        "{%- if add_vision_id -%}"
+                            "{{- 'Picture ' ~ image_count.value ~ ': ' -}}"  # `~` stringifies the int counter; `+` would raise TypeError
+                        "{%- endif -%}"
+                        "{{- '<|vision_start|>' -}}"
+                        "{%- if content.image_url is string -%}"
+                            "{{- content.image_url -}}"
+                        "{%- else -%}"
+                            "{{- content.image_url.url -}}"
+                        "{%- endif -%}"
+                        "{{- '<|vision_end|>' -}}"
+                    "{%- endif -%}"
+                    # Video not supported yet
+                    "{%- if 'text' in content -%}"
+                        "{{- content.text -}}"
+                    "{%- endif -%}"
+                "{%- endfor -%}"
+            "{%- endif -%}"
+            "{%- if message.role == 'assistant' -%}"
+                "{%- if message.tool_calls -%}"
+                    "{%- for tool_call in message.tool_calls -%}"
+                        "{%- if (loop.first and message.content) or (not loop.first) -%}"
+                            "{{- '\n' -}}"
+                        "{%- endif -%}"
+                        "{%- if tool_call.function -%}"
+                            "{%- set tool_call = tool_call.function -%}"
+                        "{%- endif -%}"
+                        "{{- '<tool_call>\n{\"name\": \"' + tool_call.name + '\", \"arguments\": ' -}}"
+                        "{%- if tool_call.arguments is string -%}"
+                            "{{- tool_call.arguments -}}"
+                        "{%- else -%}"
+                            "{{- tool_call.arguments | tojson -}}"
+                        "{%- endif -%}"
+                        "{{- '}\n</tool_call>' -}}"
+                    "{%- endfor -%}"
+                "{%- endif -%}"
+            "{%- elif message.role == 'tool' -%}"
+                "{{- '</tool_response>' -}}"
+            "{%- endif -%}"
+            # Every non-system turn (assistant and tool turns included) must be closed with <|im_end|>
+            "{%- if message.role != 'system' -%}{{- '<|im_end|>\n' -}}{%- endif -%}"
+        "{%- endfor -%}"
+        "{{- '<|im_start|>assistant\n' -}}"  # Generation prompt; must use the real <|im_start|> special token
+        # The thinking model doesn't need the <think></think> tags in this template; the model generates the tags during inference when needed
+    )
 
     def __init__(
         self,
-        use_think_prompt: bool = True,
-        verbose: bool = True,
+        thinking_budget: int | None = None,
         **kwargs,
     ):
-        """
-        Parameters:
-        - use_think_prompt (bool):
-            - True (default): Use the '<think>' prompt (for Thinking version).
-            - False: Do not use '<think>'              (for Instruct version).
-        - verbose (bool): Whether to print verbose logs.
-        """
-        self.use_think_prompt = use_think_prompt
-        self.verbose = verbose
-
-        if self.use_think_prompt:
-            self.CHAT_FORMAT = self.CHAT_FORMAT_BASE + "<|im_start|>assistant\n<think>\n"
-        else:
-            self.CHAT_FORMAT = self.CHAT_FORMAT_BASE + "<|im_start|>assistant\n"
-
+        self.thinking_budget = thinking_budget
         super().__init__(**kwargs)
 
     def __call__(self, **kwargs):
+        if self.thinking_budget is not None:
+            self.extra_template_arguments["thinking_budget"] = str(self.thinking_budget)
+
         llama = kwargs['llama']
 
         # Clear state for multiple runs
@@ -3770,9 +3813,9 @@ def __call__(self, **kwargs):
             messages = kwargs.get('messages', [])
             try:
                 image_count = len(self.get_image_urls(messages))
-                print(f"Qwen3VLHandler(think={self.use_think_prompt}) - Cleared state, processing {image_count} images", file=sys.stderr)
+                print(f"Qwen3VLHandler(thinking_budget={self.thinking_budget}) - Cleared state, processing {image_count} images", file=sys.stderr)
             except Exception:
-                print(f"Qwen3VLHandler(think={self.use_think_prompt}) - Cleared state", file=sys.stderr)
+                print(f"Qwen3VLHandler(thinking_budget={self.thinking_budget}) - Cleared state", file=sys.stderr)
 
         # Use parent implementation
         return super().__call__(**kwargs)