Skip to content

Commit 17ba24f

Browse files
authored
Merge pull request #17 from TAO71-AI/main
Better Qwen3VL chat template.
2 parents a1c6dc2 + 14d14cc commit 17ba24f

File tree

1 file changed

+102
-44
lines changed

1 file changed

+102
-44
lines changed

llama_cpp/llama_chat_format.py

Lines changed: 102 additions & 44 deletions
Original file line number | Diff line number | Diff line change
@@ -2800,6 +2800,7 @@ def __init__(self, clip_model_path: str, verbose: bool = True):
28002800
self._mtmd_cpp = mtmd_cpp
28012801
self._exit_stack = ExitStack()
28022802
self.mtmd_ctx: Optional[mtmd_cpp.mtmd_context_p] = None
2803+
self.extra_template_arguments: dict[str, Any] = {}
28032804

28042805
if not os.path.exists(clip_model_path):
28052806
raise ValueError(f"Clip model path does not exist: {clip_model_path}")
@@ -2931,9 +2932,11 @@ def __call__(
29312932
# Replace image URLs with media markers in the template
29322933
text = template.render(
29332934
messages=messages,
2935+
tools=tools,
29342936
add_generation_prompt=True,
29352937
eos_token=llama.detokenize([llama.token_eos()]),
29362938
bos_token=llama.detokenize([llama.token_bos()]),
2939+
**self.extra_template_arguments
29372940
)
29382941

29392942
# Replace image URLs in text with media markers
@@ -3696,61 +3699,116 @@ def __call__(self, **kwargs):
36963699

36973700

36983701
class Qwen3VLChatHandler(Llava15ChatHandler):
3699-
DEFAULT_SYSTEM_MESSAGE = "You are a helpful assistant."
3700-
3701-
CHAT_FORMAT_BASE = (
3702-
"{% set image_count = namespace(value=0) %}"
3703-
"{% for message in messages %}"
3704-
"{% if loop.first and message['role'] != 'system' %}"
3705-
"<|im_start|>system\n"
3706-
"{{ self.DEFAULT_SYSTEM_MESSAGE }}<|im_end|>\n"
3707-
"{% endif %}"
3708-
"<|im_start|>{{ message['role'] }}\n"
3709-
"{% if message['content'] is string %}"
3710-
"{{ message['content'] }}<|im_end|>\n"
3711-
"{% else %}"
3712-
"{% for content in message['content'] %}"
3713-
"{% if content['type'] == 'image_url' %}"
3714-
"{% if content.image_url is string %}"
3715-
"{% set image_count.value = image_count.value + 1 %}"
3716-
"Picture {{ image_count.value }}: <|vision_start|> {{ content.image_url }} <|vision_end|>"
3717-
"{% else %}"
3718-
"{% set image_count.value = image_count.value + 1 %}"
3719-
"Picture {{ image_count.value }}: <|vision_start|> {{ content.image_url.url }} <|vision_end|>"
3720-
"{% endif %}"
3721-
"{% elif content['type'] == 'text' %}"
3722-
"{{ content['text'] }}"
3723-
"{% endif %}"
3724-
"{% endfor %}"
3725-
"<|im_end|>\n"
3726-
"{% endif %}"
3727-
"{% endfor %}"
3702+
# Jinja2 chat template for Qwen3-VL, using ChatML-style turns delimited by
# <|im_start|> / <|im_end|>.
#
# Template variables read here:
#   messages              - list of chat messages (role / content / tool_calls)
#   tools                 - optional list of tool definitions, dumped as JSON
#   add_generation_prompt - when truthy, open an assistant turn at the end
#   force_reasoning       - when truthy, pre-fill '<think>\n' in that turn
#   add_vision_id         - when truthy, prefix each image with 'Picture N: '
#
# Every adjacent Python string literal below is concatenated into a single
# template string; the Python-level indentation is purely for readability.
CHAT_FORMAT = (
    # --- System turn ------------------------------------------------------
    # Always emitted. Uses the first message when it is a system message
    # (plain string, a mapping with a 'text' key, or a list of parts whose
    # text parts are concatenated); otherwise falls back to the default
    # prompt. `has_system` also guards against an empty `messages` list.
    "{{- '<|im_start|>system\n' -}}"
    "{%- set has_system = messages and messages[0].role == 'system' -%}"
    "{%- if has_system and messages[0].content is string -%}"
        "{{- messages[0].content -}}"
    "{%- elif has_system and messages[0].content is mapping and 'text' in messages[0].content -%}"
        "{{- messages[0].content.text -}}"
    "{%- elif has_system and messages[0].content is iterable and messages[0].content is not mapping -%}"
        "{%- for part in messages[0].content -%}"
            "{%- if part is mapping and 'text' in part -%}"
                "{{- part.text -}}"
            "{%- endif -%}"
        "{%- endfor -%}"
    "{%- else -%}"
        "{{- 'You are a helpful assistant.' -}}"
    "{%- endif -%}"
    # Tool signatures and calling instructions live inside the system turn.
    "{%- if tools -%}"
        "{{- '\n\n' -}}"
        "{{- '# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>' -}}"
        "{%- for tool in tools -%}"
            "{{- '\n' -}}"
            "{{- tool | tojson -}}"
        "{%- endfor -%}"
        "{{- '\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <arguments-json-object>}\n</tool_call>\n\nYou can also return a response for the user alongside a function call:\n<response-for-user>\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <arguments-json-object>}\n</tool_call>' -}}"
    "{%- endif -%}"
    "{{- '<|im_end|>\n' -}}"
    # --- Conversation turns -----------------------------------------------
    # A namespace is needed so the counter survives across loop iterations.
    # Video parts are not supported yet, so there is no video counter.
    "{%- set image_count = namespace(value=0) -%}"
    "{%- for message in messages -%}"
    # System messages were already consumed by the system turn above.
    "{%- if message.role != 'system' -%}"
        # Tool results are presented to the model as a user turn.
        "{%- if message.role == 'tool' -%}"
            "{{- '<|im_start|>user\n<tool_response>\n' -}}"
        "{%- else -%}"
            "{{- '<|im_start|>' + message.role + '\n' -}}"
        "{%- endif -%}"
        "{%- if message.content is string -%}"
            "{{- message.content -}}"
        # Truthiness guard: assistant tool-call messages commonly carry
        # content=None, which must not be iterated.
        "{%- elif message.content -%}"
            "{%- for content in message.content -%}"
                "{%- if 'image_url' in content -%}"
                    "{%- set image_count.value = image_count.value + 1 -%}"
                    "{%- if add_vision_id -%}"
                        "{{- 'Picture ' -}}"
                        "{{- image_count.value | string -}}"
                        "{{- ': ' -}}"
                    "{%- endif -%}"
                    "{{- '<|vision_start|>' -}}"
                    # image_url may be a bare string or an object with .url
                    "{%- if content.image_url is string -%}"
                        "{{- content.image_url -}}"
                    "{%- else -%}"
                        "{{- content.image_url.url -}}"
                    "{%- endif -%}"
                    "{{- '<|vision_end|>' -}}"
                "{%- endif -%}"
                "{%- if 'text' in content -%}"
                    "{{- content.text -}}"
                "{%- endif -%}"
            "{%- endfor -%}"
        "{%- endif -%}"
        "{%- if message.role == 'assistant' and message.tool_calls -%}"
            "{%- for tool_call in message.tool_calls -%}"
                # Separate the call from preceding text / previous calls.
                "{%- if (loop.first and message.content) or (not loop.first) -%}"
                    "{{- '\n' -}}"
                "{%- endif -%}"
                # Accept both {'function': {...}} and flat {name, arguments}.
                "{%- if tool_call.function -%}"
                    "{%- set tool_call = tool_call.function -%}"
                "{%- endif -%}"
                "{{- '<tool_call>\n{\"name\": \"' + tool_call.name + '\", \"arguments\": ' -}}"
                "{%- if tool_call.arguments is string -%}"
                    "{{- tool_call.arguments -}}"
                "{%- else -%}"
                    "{{- tool_call.arguments | tojson -}}"
                "{%- endif -%}"
                "{{- '}\n</tool_call>' -}}"
            "{%- endfor -%}"
        "{%- endif -%}"
        "{%- if message.role == 'tool' -%}"
            "{{- '</tool_response>' -}}"
        "{%- endif -%}"
        # Every rendered turn (user, assistant, tool, ...) must be closed;
        # otherwise the next <|im_start|> runs into the previous turn.
        "{{- '<|im_end|>\n' -}}"
    "{%- endif -%}"
    "{%- endfor -%}"
    # --- Generation prompt ------------------------------------------------
    "{%- if add_generation_prompt -%}"
        "{{- '<|im_start|>assistant\n' -}}"
        "{%- if force_reasoning -%}"
            "{{- '<think>\n' -}}"
        "{%- endif -%}"
    "{%- endif -%}"
)
37293787

37303788
def __init__(
    self,
    force_reasoning: bool = False,
    add_vision_id: bool = True,
    **kwargs,
):
    """
    Parameters:
    - force_reasoning (bool):
        - True: Force the reasoning in the model by adding <think> to the chat template.
        - False (default): Don't force the reasoning.
    - add_vision_id (bool):
        - True (default): Count all the images. Recommended for multi-image.
        - False: Doesn't count the images. Can save tokens with single-image.
    - **kwargs: Forwarded unchanged to the parent handler's __init__.

    Deprecated:
    - use_think_prompt (bool): Former name of this option. When passed, its
      value is used for force_reasoning and a DeprecationWarning is issued.
    """
    # The parent __init__ does not accept `use_think_prompt`, so forwarding
    # it would raise TypeError for callers written against the old
    # signature. Consume it here and map it onto the new flag instead.
    if "use_think_prompt" in kwargs:
        import warnings

        warnings.warn(
            "'use_think_prompt' is deprecated; use 'force_reasoning' instead.",
            DeprecationWarning,
            stacklevel=2,
        )
        force_reasoning = bool(kwargs.pop("use_think_prompt"))

    # Stored before the parent initializer runs; both values are injected
    # into the template arguments on every __call__.
    self.force_reasoning = force_reasoning
    self.add_vision_id = add_vision_id

    super().__init__(**kwargs)
37523807

37533808
def __call__(self, **kwargs):
3809+
self.extra_template_arguments["force_reasoning"] = self.force_reasoning
3810+
self.extra_template_arguments["add_vision_id"] = self.add_vision_id
3811+
37543812
llama = kwargs['llama']
37553813

37563814
# Clear state for multiple runs
@@ -3770,9 +3828,9 @@ def __call__(self, **kwargs):
37703828
messages = kwargs.get('messages', [])
37713829
try:
37723830
image_count = len(self.get_image_urls(messages))
3773-
print(f"Qwen3VLHandler(think={self.use_think_prompt}) - Cleared state, processing {image_count} images", file=sys.stderr)
3831+
print(f"Qwen3VLHandler(force_reasoning={self.force_reasoning}) - Cleared state, processing {image_count} images", file=sys.stderr)
37743832
except Exception:
3775-
print(f"Qwen3VLHandler(think={self.use_think_prompt}) - Cleared state", file=sys.stderr)
3833+
print(f"Qwen3VLHandler(force_reasoning={self.force_reasoning}) - Cleared state", file=sys.stderr)
37763834

37773835
# Use parent implementation
37783836
return super().__call__(**kwargs)

0 commit comments

Comments
 (0)