diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index bd3da64df..e1a447550 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -2800,6 +2800,7 @@ def __init__(self, clip_model_path: str, verbose: bool = True): self._mtmd_cpp = mtmd_cpp self._exit_stack = ExitStack() self.mtmd_ctx: Optional[mtmd_cpp.mtmd_context_p] = None + self.extra_template_arguments: dict[str, Any] = {} if not os.path.exists(clip_model_path): raise ValueError(f"Clip model path does not exist: {clip_model_path}") @@ -2931,9 +2932,11 @@ def __call__( # Replace image URLs with media markers in the template text = template.render( messages=messages, + tools=tools, add_generation_prompt=True, eos_token=llama.detokenize([llama.token_eos()]), bos_token=llama.detokenize([llama.token_bos()]), + **self.extra_template_arguments ) # Replace image URLs in text with media markers @@ -3696,61 +3699,116 @@ def __call__(self, **kwargs): class Qwen3VLChatHandler(Llava15ChatHandler): - DEFAULT_SYSTEM_MESSAGE = "You are a helpful assistant." 
- - CHAT_FORMAT_BASE = ( - "{% set image_count = namespace(value=0) %}" - "{% for message in messages %}" - "{% if loop.first and message['role'] != 'system' %}" - "<|im_start|>system\n" - "{{ self.DEFAULT_SYSTEM_MESSAGE }}<|im_end|>\n" - "{% endif %}" - "<|im_start|>{{ message['role'] }}\n" - "{% if message['content'] is string %}" - "{{ message['content'] }}<|im_end|>\n" - "{% else %}" - "{% for content in message['content'] %}" - "{% if content['type'] == 'image_url' %}" - "{% if content.image_url is string %}" - "{% set image_count.value = image_count.value + 1 %}" - "Picture {{ image_count.value }}: <|vision_start|> {{ content.image_url }} <|vision_end|>" - "{% else %}" - "{% set image_count.value = image_count.value + 1 %}" - "Picture {{ image_count.value }}: <|vision_start|> {{ content.image_url.url }} <|vision_end|>" - "{% endif %}" - "{% elif content['type'] == 'text' %}" - "{{ content['text'] }}" - "{% endif %}" - "{% endfor %}" - "<|im_end|>\n" - "{% endif %}" - "{% endfor %}" + CHAT_FORMAT = ( + "{{- '<|im_start|>system\n' -}}" + "{%- if messages[0].content is string and messages[0].role == 'system' -%}" + "{{- messages[0].content -}}" + "{%- elif messages[0].role == 'system' -%}" + "{%- if 'text' in messages[0].content -%}" + "{{- messages[0].content.text -}}" + "{%- else -%}" + "{{- 'You are a helpful assistant.' 
-}}" + "{%- endif -%}" + "{%- endif -%}" + "{%- if tools -%}" + "{{- '\n\n' -}}" + "{{- '# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n' -}}" + "{%- for tool in tools -%}" + "{{- '\n' -}}" + "{{- tool | tojson -}}" + "{%- endfor -%}" + "{{- '\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n\n\nYou can also return a response for the user alongside a function call:\n\n\n{\"name\": , \"arguments\": }\n' -}}" + "{%- endif -%}" + "{{- '<|im_end|>\n' -}}" + "{%- set image_count = namespace(value=0) -%}" + #"{%- set video_count = namespace(value=0) -%}" + "{%- for message in messages -%}" + "{%- if message.role == 'tool' -%}" + "{{- '<|im_start|>user\n\n' -}}" + "{%- elif message.role != 'system' -%}" + "{{- '<|im_start|>' + message.role + '\n' -}}" + "{%- endif -%}" + "{%- if message.content is string and message.role != 'system' -%}" + "{{- message.content -}}" + "{%- elif message.role != 'system' -%}" + "{%- for content in message.content -%}" + "{%- if 'image_url' in content -%}" + "{%- set image_count.value = image_count.value + 1 -%}" + "{%- if add_vision_id -%}" + "{{- 'Picture ' -}}" + "{{- image_count.value | string -}}" + "{{- ': ' -}}" + "{%- endif -%}" + "{{- '<|vision_start|>' -}}" + "{%- if content.image_url is string -%}" + "{{- content.image_url -}}" + "{%- else -%}" + "{{- content.image_url.url -}}" + "{%- endif -%}" + "{{- '<|vision_end|>' -}}" + "{%- endif -%}" + # Video not supported yet + "{%- if 'text' in content -%}" + "{{- content.text -}}" + "{%- endif -%}" + "{%- endfor -%}" + "{%- endif -%}" + "{%- if message.role == 'assistant' -%}" + "{%- if message.tool_calls -%}" + "{%- for tool_call in message.tool_calls -%}" + "{%- if (loop.first and message.content) or (not loop.first) -%}" + "{{- '\n' -}}" + "{%- endif -%}" + "{%- if tool_call.function -%}" + "{%- set tool_call 
= tool_call.function -%}" + "{%- endif -%}" + "{{- '\n{\"name\": \"' + tool_call.name + '\", \"arguments\": ' -}}" + "{%- if tool_call.arguments is string -%}" + "{{- tool_call.arguments -}}" + "{%- else -%}" + "{{- tool_call.arguments | tojson -}}" + "{%- endif -%}" + "{{- '}\n' -}}" + "{%- endfor -%}" + "{%- endif -%}" + "{%- elif message.role == 'tool' -%}" + "{{- '' -}}" + "{%- elif message.role != 'system' -%}" + "{{- '<|im_end|>\n' -}}" + "{%- endif -%}" + "{%- endfor -%}" + "{%- if add_generation_prompt -%}" + "{{- 'assistant\n' -}}" + "{%- if force_reasoning -%}" + "{{- '\n' -}}" + "{%- endif -%}" + "{%- endif -%}" ) def __init__( self, - use_think_prompt: bool = True, - verbose: bool = True, + force_reasoning: bool = False, + add_vision_id: bool = True, **kwargs, ): """ Parameters: - - use_think_prompt (bool): - - True (default): Use the '' prompt (for Thinking version). - - False: Do not use '' (for Instruct version). - - verbose (bool): Whether to print verbose logs. + - force_reasoning (bool): + - True: Force reasoning by appending a reasoning prefix to the generation prompt of the chat template. + - False (default): Don't force the reasoning. + - add_vision_id (bool): + - True (default): Prefix each image with 'Picture N: '. Recommended for multi-image. + - False: Omit the 'Picture N: ' prefix. Can save tokens with single-image. 
""" - self.use_think_prompt = use_think_prompt - self.verbose = verbose - - if self.use_think_prompt: - self.CHAT_FORMAT = self.CHAT_FORMAT_BASE + "<|im_start|>assistant\n\n" - else: - self.CHAT_FORMAT = self.CHAT_FORMAT_BASE + "<|im_start|>assistant\n" + self.force_reasoning = force_reasoning + self.add_vision_id = add_vision_id super().__init__(**kwargs) def __call__(self, **kwargs): + self.extra_template_arguments["force_reasoning"] = self.force_reasoning + self.extra_template_arguments["add_vision_id"] = self.add_vision_id + llama = kwargs['llama'] # Clear state for multiple runs @@ -3770,9 +3828,9 @@ def __call__(self, **kwargs): messages = kwargs.get('messages', []) try: image_count = len(self.get_image_urls(messages)) - print(f"Qwen3VLHandler(think={self.use_think_prompt}) - Cleared state, processing {image_count} images", file=sys.stderr) + print(f"Qwen3VLHandler(force_reasoning={self.force_reasoning}) - Cleared state, processing {image_count} images", file=sys.stderr) except Exception: - print(f"Qwen3VLHandler(think={self.use_think_prompt}) - Cleared state", file=sys.stderr) + print(f"Qwen3VLHandler(force_reasoning={self.force_reasoning}) - Cleared state", file=sys.stderr) # Use parent implementation return super().__call__(**kwargs)