Skip to content

Commit 1d41825

Browse files
committed
Better Qwen3VL chat template.
1 parent 6f8ec8b commit 1d41825

File tree

1 file changed

+91
-48
lines changed

1 file changed

+91
-48
lines changed

llama_cpp/llama_chat_format.py

Lines changed: 91 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -2800,6 +2800,7 @@ def __init__(self, clip_model_path: str, verbose: bool = True):
28002800
self._mtmd_cpp = mtmd_cpp
28012801
self._exit_stack = ExitStack()
28022802
self.mtmd_ctx: Optional[mtmd_cpp.mtmd_context_p] = None
2803+
self.extra_template_arguments: dict[str, Any] = {}
28032804

28042805
if not os.path.exists(clip_model_path):
28052806
raise ValueError(f"Clip model path does not exist: {clip_model_path}")
@@ -2931,9 +2932,11 @@ def __call__(
29312932
# Replace image URLs with media markers in the template
29322933
text = template.render(
29332934
messages=messages,
2935+
tools=tools,
29342936
add_generation_prompt=True,
29352937
eos_token=llama.detokenize([llama.token_eos()]),
29362938
bos_token=llama.detokenize([llama.token_bos()]),
2939+
**self.extra_template_arguments
29372940
)
29382941

29392942
# Replace image URLs in text with media markers
@@ -3696,61 +3699,101 @@ def __call__(self, **kwargs):
36963699

36973700

36983701
class Qwen3VLChatHandler(Llava15ChatHandler):
3699-
DEFAULT_SYSTEM_MESSAGE = "You are a helpful assistant."
3700-
3701-
CHAT_FORMAT_BASE = (
3702-
"{% set image_count = namespace(value=0) %}"
3703-
"{% for message in messages %}"
3704-
"{% if loop.first and message['role'] != 'system' %}"
3705-
"<|im_start|>system\n"
3706-
"{{ self.DEFAULT_SYSTEM_MESSAGE }}<|im_end|>\n"
3707-
"{% endif %}"
3708-
"<|im_start|>{{ message['role'] }}\n"
3709-
"{% if message['content'] is string %}"
3710-
"{{ message['content'] }}<|im_end|>\n"
3711-
"{% else %}"
3712-
"{% for content in message['content'] %}"
3713-
"{% if content['type'] == 'image_url' %}"
3714-
"{% if content.image_url is string %}"
3715-
"{% set image_count.value = image_count.value + 1 %}"
3716-
"Picture {{ image_count.value }}: <|vision_start|> {{ content.image_url }} <|vision_end|>"
3717-
"{% else %}"
3718-
"{% set image_count.value = image_count.value + 1 %}"
3719-
"Picture {{ image_count.value }}: <|vision_start|> {{ content.image_url.url }} <|vision_end|>"
3720-
"{% endif %}"
3721-
"{% elif content['type'] == 'text' %}"
3722-
"{{ content['text'] }}"
3723-
"{% endif %}"
3724-
"{% endfor %}"
3725-
"<|im_end|>\n"
3726-
"{% endif %}"
3727-
"{% endfor %}"
3702+
CHAT_FORMAT = (
    # Jinja2 chat template for Qwen3-VL (ChatML-style turns delimited by
    # <|im_start|>role ... <|im_end|>). All literal output is produced from
    # inside {{- ... -}} expressions, so the Python-level indentation below is
    # purely cosmetic and never reaches the rendered prompt.
    #
    # --- System turn -------------------------------------------------------
    # A system turn is always emitted. Its text comes from the first message
    # when that message has the 'system' role (string content, a single
    # {'text': ...} mapping, or a list of content parts); otherwise the
    # default assistant prompt is used so the turn is never left empty.
    "{{- '<|im_start|>system\n' -}}"
    "{%- set system_prompt = namespace(text='', found=false) -%}"
    "{%- if messages and messages[0].role == 'system' -%}"
        "{%- if messages[0].content is string -%}"
            "{%- set system_prompt.text = messages[0].content -%}"
            "{%- set system_prompt.found = true -%}"
        "{%- elif messages[0].content is mapping -%}"
            "{%- if 'text' in messages[0].content -%}"
                "{%- set system_prompt.text = messages[0].content.text -%}"
                "{%- set system_prompt.found = true -%}"
            "{%- endif -%}"
        "{%- elif messages[0].content is iterable -%}"
            "{%- for part in messages[0].content -%}"
                "{%- if part is mapping and 'text' in part -%}"
                    "{%- set system_prompt.text = system_prompt.text ~ part.text -%}"
                    "{%- set system_prompt.found = true -%}"
                "{%- endif -%}"
            "{%- endfor -%}"
        "{%- endif -%}"
    "{%- endif -%}"
    "{%- if system_prompt.found -%}"
        "{{- system_prompt.text -}}"
    "{%- else -%}"
        "{{- 'You are a helpful assistant.' -}}"
    "{%- endif -%}"
    # Tool signatures are appended to the system turn when tools are given.
    "{%- if tools -%}"
        "{{- '\n\n' -}}"
        "{{- '# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>' -}}"
        "{%- for tool in tools -%}"
            "{{- '\n' -}}"
            "{{- tool | tojson -}}"
        "{%- endfor -%}"
        "{{- '\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <arguments-json-object>}\n</tool_call>\n\nYou can also return a response for the user alongside a function call:\n<response-for-user>\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <arguments-json-object>}\n</tool_call>' -}}"
    "{%- endif -%}"
    # NOTE: the `thinking_budget` template argument is currently NOT consumed.
    # The section below was disabled by the author ("Doesn't work very well").
    #"{%- if thinking_budget -%}"
    #    "{{- '\n\n# Reasoning\n\nYou must generate your reasoning steps within <think></think> XML tags:\n<think>\n<reasoning-content>\n</think>\n<final-response>\n\nThe reasoning content must not exceed the ' + thinking_budget + ' tokens budget.' -}}"
    #"{%- endif -%}"
    "{{- '<|im_end|>\n' -}}"
    # --- Conversation turns ------------------------------------------------
    "{%- set image_count = namespace(value=0) -%}"
    #"{%- set video_count = namespace(value=0) -%}"
    "{%- for message in messages -%}"
        # System messages were already handled above and are skipped here.
        "{%- if message.role != 'system' -%}"
            # Turn header. Tool results are presented to the model inside a
            # user turn; consecutive tool messages share a single user turn.
            "{%- if message.role == 'tool' -%}"
                "{%- if loop.first or messages[loop.index0 - 1].role != 'tool' -%}"
                    "{{- '<|im_start|>user' -}}"
                "{%- endif -%}"
                "{{- '\n<tool_response>\n' -}}"
            "{%- else -%}"
                "{{- '<|im_start|>' + message.role + '\n' -}}"
            "{%- endif -%}"
            # Turn body. Content may be a plain string, a list of parts, or
            # empty/None (e.g. an assistant message carrying only tool calls).
            "{%- if message.content is string -%}"
                "{{- message.content -}}"
            "{%- elif message.content -%}"
                "{%- for content in message.content -%}"
                    "{%- if 'image_url' in content -%}"
                        "{%- set image_count.value = image_count.value + 1 -%}"
                        "{%- if add_vision_id -%}"
                            # `~` coerces the integer counter to a string;
                            # `+` would raise TypeError (str + int).
                            "{{- 'Picture ' ~ image_count.value ~ ': ' -}}"
                        "{%- endif -%}"
                        "{{- '<|vision_start|>' -}}"
                        "{%- if content.image_url is string -%}"
                            "{{- content.image_url -}}"
                        "{%- else -%}"
                            "{{- content.image_url.url -}}"
                        "{%- endif -%}"
                        "{{- '<|vision_end|>' -}}"
                    "{%- endif -%}"
                    # Video not supported yet
                    "{%- if 'text' in content -%}"
                        "{{- content.text -}}"
                    "{%- endif -%}"
                "{%- endfor -%}"
            "{%- endif -%}"
            # Assistant tool calls are serialized after any textual content.
            "{%- if message.role == 'assistant' and message.tool_calls -%}"
                "{%- for tool_call in message.tool_calls -%}"
                    "{%- if (loop.first and message.content) or (not loop.first) -%}"
                        "{{- '\n' -}}"
                    "{%- endif -%}"
                    "{%- if tool_call.function -%}"
                        "{%- set tool_call = tool_call.function -%}"
                    "{%- endif -%}"
                    "{{- '<tool_call>\n{\"name\": \"' + tool_call.name + '\", \"arguments\": ' -}}"
                    "{%- if tool_call.arguments is string -%}"
                        "{{- tool_call.arguments -}}"
                    "{%- else -%}"
                        "{{- tool_call.arguments | tojson -}}"
                    "{%- endif -%}"
                    "{{- '}\n</tool_call>' -}}"
                "{%- endfor -%}"
            "{%- endif -%}"
            # Turn footer. Every turn must be closed with <|im_end|>; a run of
            # tool responses is closed once, after the last one in the run.
            "{%- if message.role == 'tool' -%}"
                "{{- '\n</tool_response>' -}}"
                "{%- if loop.last or messages[loop.index0 + 1].role != 'tool' -%}"
                    "{{- '<|im_end|>\n' -}}"
                "{%- endif -%}"
            "{%- else -%}"
                "{{- '<|im_end|>\n' -}}"
            "{%- endif -%}"
        "{%- endif -%}"
    "{%- endfor -%}"
    # Generation prompt: open the assistant turn for the model to complete.
    "{{- '<|im_start|>assistant\n' -}}"
    # The thinking model doesn't need the <think></think> tags in this template; the model generates the tags during inference when needed
)
37293784

37303785
def __init__(
    self,
    thinking_budget: Optional[int] = None,
    **kwargs,
):
    """
    Create a Qwen3-VL chat handler.

    Parameters:
    - thinking_budget (Optional[int]): Optional reasoning-token budget.
        It is stored on the instance and, when not None, passed to the
        chat template as the string argument ``thinking_budget`` on each
        call. NOTE(review): the template section that would consume this
        value is currently commented out, so it appears to have no effect
        on the rendered prompt -- confirm before relying on it.
    - **kwargs: Forwarded unchanged to the parent handler's initializer.
    """
    # Set before calling the parent initializer so the attribute exists even
    # if parent initialization raises.
    self.thinking_budget = thinking_budget
    super().__init__(**kwargs)
37523792

37533793
def __call__(self, **kwargs):
3794+
if self.thinking_budget is not None:
3795+
self.extra_template_arguments["thinking_budget"] = str(self.thinking_budget)
3796+
37543797
llama = kwargs['llama']
37553798

37563799
# Clear state for multiple runs
@@ -3770,9 +3813,9 @@ def __call__(self, **kwargs):
37703813
messages = kwargs.get('messages', [])
37713814
try:
37723815
image_count = len(self.get_image_urls(messages))
3773-
print(f"Qwen3VLHandler(think={self.use_think_prompt}) - Cleared state, processing {image_count} images", file=sys.stderr)
3816+
print(f"Qwen3VLHandler(thinking_budget={self.thinking_budget}) - Cleared state, processing {image_count} images", file=sys.stderr)
37743817
except Exception:
3775-
print(f"Qwen3VLHandler(think={self.use_think_prompt}) - Cleared state", file=sys.stderr)
3818+
print(f"Qwen3VLHandler(thinking_budget={self.thinking_budget}) - Cleared state", file=sys.stderr)
37763819

37773820
# Use parent implementation
37783821
return super().__call__(**kwargs)

0 commit comments

Comments
 (0)