Commit 02cc8cb

feat(llama.cpp): consolidate options and respect tokenizer template when enabled (#7120)
* feat(llama.cpp): expose env vars as options for consistency

  This allows configuring everything in the model's YAML file rather than relying on global configuration.

* feat(llama.cpp): respect usetokenizertemplate and use llama.cpp templating system to process messages
* WIP
* Detect template exists if use tokenizer template is enabled
* Better recognition of chat
* Fixes to support tool calls while using templates from tokenizer
* Fixups
* Drop template guessing, fix passing tools to tokenizer
* Extract grammar and other options from chat template, add schema struct
* WIP
* WIP
* Automatically set use_jinja
* Cleanups, identify GGUF models for chat by default
* Update docs

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
1 parent e5e86d0 commit 02cc8cb

File tree

17 files changed: +974, -545 lines


backend/backend.proto

Lines changed: 7 additions & 0 deletions
@@ -154,6 +154,8 @@ message PredictOptions {
   repeated string Videos = 45;
   repeated string Audios = 46;
   string CorrelationId = 47;
+  string Tools = 48; // JSON array of available tools/functions for tool calling
+  string ToolChoice = 49; // JSON string or object specifying tool choice behavior
 }
 
 // The response message containing the result
@@ -382,6 +384,11 @@ message StatusResponse {
 message Message {
   string role = 1;
   string content = 2;
+  // Optional fields for OpenAI-compatible message format
+  string name = 3; // Tool name (for tool messages)
+  string tool_call_id = 4; // Tool call ID (for tool messages)
+  string reasoning_content = 5; // Reasoning content (for thinking models)
+  string tool_calls = 6; // Tool calls as JSON string (for assistant messages with tool calls)
 }
 
 message DetectOptions {
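
The new fields keep tool data as opaque JSON strings rather than modeling the OpenAI schema in protobuf, so the Go core can hand them to the llama.cpp backend unchanged. Below is a minimal sketch of how a caller might populate them; the structs are local stand-ins for the generated protoc-gen-go types, and the CamelCase field names (Tools, ToolChoice, ToolCallId, ...) are assumptions based on the standard proto-to-Go name mapping:

```go
package main

import (
	"encoding/json"
	"fmt"
)

// Local mirrors of the generated proto types, for illustration only.
type PredictOptions struct {
	Tools      string // JSON array of available tools (field 48 above)
	ToolChoice string // JSON tool-choice spec (field 49 above)
}

type Message struct {
	Role             string
	Content          string
	Name             string // tool name, for role == "tool"
	ToolCallId       string // links a tool result back to the originating call
	ReasoningContent string // reasoning text emitted by thinking models
	ToolCalls        string // JSON-encoded tool calls on assistant turns
}

func main() {
	// Tools are serialized once here and parsed again by the backend,
	// which feeds them to the tokenizer's chat template.
	tools, _ := json.Marshal([]map[string]any{{
		"type": "function",
		"function": map[string]any{
			"name":        "get_weather", // hypothetical tool
			"description": "Get the current weather for a city",
		},
	}})

	opts := PredictOptions{Tools: string(tools), ToolChoice: `"auto"`}
	toolResult := Message{Role: "tool", Name: "get_weather", ToolCallId: "call_0", Content: `{"temp_c": 21}`}
	fmt.Println(opts.Tools, toolResult.ToolCallId)
}
```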

backend/cpp/llama-cpp/grpc-server.cpp

Lines changed: 495 additions & 73 deletions
Large diffs are not rendered by default.

core/backend/llm.go

Lines changed: 5 additions & 26 deletions
@@ -2,8 +2,6 @@ package backend
 
 import (
 	"context"
-	"encoding/json"
-	"fmt"
 	"regexp"
 	"slices"
 	"strings"
@@ -35,7 +33,7 @@ type TokenUsage struct {
 	TimingTokenGeneration float64
 }
 
-func ModelInference(ctx context.Context, s string, messages []schema.Message, images, videos, audios []string, loader *model.ModelLoader, c *config.ModelConfig, cl *config.ModelConfigLoader, o *config.ApplicationConfig, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) {
+func ModelInference(ctx context.Context, s string, messages schema.Messages, images, videos, audios []string, loader *model.ModelLoader, c *config.ModelConfig, cl *config.ModelConfigLoader, o *config.ApplicationConfig, tokenCallback func(string, TokenUsage) bool, tools string, toolChoice string) (func() (LLMResponse, error), error) {
 	modelFile := c.Model
 
 	// Check if the modelFile exists, if it doesn't try to load it from the gallery
@@ -65,29 +63,8 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
 	var protoMessages []*proto.Message
 	// if we are using the tokenizer template, we need to convert the messages to proto messages
 	// unless the prompt has already been tokenized (non-chat endpoints + functions)
-	if c.TemplateConfig.UseTokenizerTemplate && s == "" {
-		protoMessages = make([]*proto.Message, len(messages), len(messages))
-		for i, message := range messages {
-			protoMessages[i] = &proto.Message{
-				Role: message.Role,
-			}
-			switch ct := message.Content.(type) {
-			case string:
-				protoMessages[i].Content = ct
-			case []interface{}:
-				// If using the tokenizer template, in case of multimodal we want to keep the multimodal content as and return only strings here
-				data, _ := json.Marshal(ct)
-				resultData := []struct {
-					Text string `json:"text"`
-				}{}
-				json.Unmarshal(data, &resultData)
-				for _, r := range resultData {
-					protoMessages[i].Content += r.Text
-				}
-			default:
-				return nil, fmt.Errorf("unsupported type for schema.Message.Content for inference: %T", ct)
-			}
-		}
+	if c.TemplateConfig.UseTokenizerTemplate && len(messages) > 0 {
+		protoMessages = messages.ToProto()
 	}
 
 	// in GRPC, the backend is supposed to answer to 1 single token if stream is not supported
@@ -99,6 +76,8 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
 	opts.Images = images
 	opts.Videos = videos
 	opts.Audios = audios
+	opts.Tools = tools
+	opts.ToolChoice = toolChoice
 
 	tokenUsage := TokenUsage{}
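
The inline conversion above moves behind a schema.Messages.ToProto() helper whose implementation is not shown in this commit view. The sketch below reconstructs its likely behavior from the deleted loop, using local stand-in types instead of the real schema and proto packages:

```go
package main

import (
	"encoding/json"
	"fmt"
)

// protoMessage stands in for the generated proto.Message type.
type protoMessage struct {
	Role    string
	Content string
}

type Message struct {
	Role    string      `json:"role"`
	Content interface{} `json:"content"` // string, or []interface{} of multimodal parts
}

type Messages []Message

// ToProto flattens each message into a proto message, keeping only the
// text parts of multimodal content, as the deleted code did.
func (m Messages) ToProto() []*protoMessage {
	out := make([]*protoMessage, len(m))
	for i, message := range m {
		out[i] = &protoMessage{Role: message.Role}
		switch ct := message.Content.(type) {
		case string:
			out[i].Content = ct
		case []interface{}:
			data, _ := json.Marshal(ct)
			var parts []struct {
				Text string `json:"text"`
			}
			json.Unmarshal(data, &parts)
			for _, p := range parts {
				out[i].Content += p.Text
			}
		}
	}
	return out
}

func main() {
	msgs := Messages{{Role: "user", Content: []interface{}{
		map[string]any{"type": "text", "text": "Hello"},
	}}}
	fmt.Println(msgs.ToProto()[0].Content) // prints "Hello"
}
```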

core/config/gguf.go

Lines changed: 5 additions & 211 deletions
@@ -1,151 +1,17 @@
 package config
 
 import (
-	"strings"
-
 	"github.com/mudler/LocalAI/pkg/xsysinfo"
 	"github.com/rs/zerolog/log"
 
 	gguf "github.com/gpustack/gguf-parser-go"
 )
 
-type familyType uint8
-
-const (
-	Unknown familyType = iota
-	LLaMa3
-	CommandR
-	Phi3
-	ChatML
-	Mistral03
-	Gemma
-	DeepSeek2
-)
-
 const (
 	defaultContextSize = 1024
 	defaultNGPULayers  = 99999999
 )
 
-type settingsConfig struct {
-	StopWords      []string
-	TemplateConfig TemplateConfig
-	RepeatPenalty  float64
-}
-
-// default settings to adopt with a given model family
-var defaultsSettings map[familyType]settingsConfig = map[familyType]settingsConfig{
-	Gemma: {
-		RepeatPenalty: 1.0,
-		StopWords:     []string{"<|im_end|>", "<end_of_turn>", "<start_of_turn>"},
-		TemplateConfig: TemplateConfig{
-			Chat:        "{{.Input }}\n<start_of_turn>model\n",
-			ChatMessage: "<start_of_turn>{{if eq .RoleName \"assistant\" }}model{{else}}{{ .RoleName }}{{end}}\n{{ if .Content -}}\n{{.Content -}}\n{{ end -}}<end_of_turn>",
-			Completion:  "{{.Input}}",
-		},
-	},
-	DeepSeek2: {
-		StopWords: []string{"<|end▁of▁sentence|>"},
-		TemplateConfig: TemplateConfig{
-			ChatMessage: `{{if eq .RoleName "user" -}}User: {{.Content }}
-{{ end -}}
-{{if eq .RoleName "assistant" -}}Assistant: {{.Content}}<|end▁of▁sentence|>{{end}}
-{{if eq .RoleName "system" -}}{{.Content}}
-{{end -}}`,
-			Chat: "{{.Input -}}\nAssistant: ",
-		},
-	},
-	LLaMa3: {
-		StopWords: []string{"<|eot_id|>"},
-		TemplateConfig: TemplateConfig{
-			Chat:        "<|begin_of_text|>{{.Input }}\n<|start_header_id|>assistant<|end_header_id|>",
-			ChatMessage: "<|start_header_id|>{{ .RoleName }}<|end_header_id|>\n\n{{.Content }}<|eot_id|>",
-		},
-	},
-	CommandR: {
-		TemplateConfig: TemplateConfig{
-			Chat: "{{.Input -}}<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",
-			Functions: `<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>
-You are a function calling AI model, you can call the following functions:
-## Available Tools
-{{range .Functions}}
-- {"type": "function", "function": {"name": "{{.Name}}", "description": "{{.Description}}", "parameters": {{toJson .Parameters}} }}
-{{end}}
-When using a tool, reply with JSON, for instance {"name": "tool_name", "arguments": {"param1": "value1", "param2": "value2"}}
-<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{.Input -}}`,
-			ChatMessage: `{{if eq .RoleName "user" -}}
-<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
-{{- else if eq .RoleName "system" -}}
-<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
-{{- else if eq .RoleName "assistant" -}}
-<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
-{{- else if eq .RoleName "tool" -}}
-<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
-{{- else if .FunctionCall -}}
-<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{toJson .FunctionCall}}}<|END_OF_TURN_TOKEN|>
-{{- end -}}`,
-		},
-		StopWords: []string{"<|END_OF_TURN_TOKEN|>"},
-	},
-	Phi3: {
-		TemplateConfig: TemplateConfig{
-			Chat:        "{{.Input}}\n<|assistant|>",
-			ChatMessage: "<|{{ .RoleName }}|>\n{{.Content}}<|end|>",
-			Completion:  "{{.Input}}",
-		},
-		StopWords: []string{"<|end|>", "<|endoftext|>"},
-	},
-	ChatML: {
-		TemplateConfig: TemplateConfig{
-			Chat: "{{.Input -}}\n<|im_start|>assistant",
-			Functions: `<|im_start|>system
-You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
-{{range .Functions}}
-{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
-{{end}}
-For each function call return a json object with function name and arguments
-<|im_end|>
-{{.Input -}}
-<|im_start|>assistant`,
-			ChatMessage: `<|im_start|>{{ .RoleName }}
-{{ if .FunctionCall -}}
-Function call:
-{{ else if eq .RoleName "tool" -}}
-Function response:
-{{ end -}}
-{{ if .Content -}}
-{{.Content }}
-{{ end -}}
-{{ if .FunctionCall -}}
-{{toJson .FunctionCall}}
-{{ end -}}<|im_end|>`,
-		},
-		StopWords: []string{"<|im_end|>", "<dummy32000>", "</s>"},
-	},
-	Mistral03: {
-		TemplateConfig: TemplateConfig{
-			Chat:      "{{.Input -}}",
-			Functions: `[AVAILABLE_TOOLS] [{{range .Functions}}{"type": "function", "function": {"name": "{{.Name}}", "description": "{{.Description}}", "parameters": {{toJson .Parameters}} }}{{end}} ] [/AVAILABLE_TOOLS]{{.Input }}`,
-			ChatMessage: `{{if eq .RoleName "user" -}}
-[INST] {{.Content }} [/INST]
-{{- else if .FunctionCall -}}
-[TOOL_CALLS] {{toJson .FunctionCall}} [/TOOL_CALLS]
-{{- else if eq .RoleName "tool" -}}
-[TOOL_RESULTS] {{.Content}} [/TOOL_RESULTS]
-{{- else -}}
-{{ .Content -}}
-{{ end -}}`,
-		},
-		StopWords: []string{"<|im_end|>", "<dummy32000>", "</tool_call>", "<|eot_id|>", "<|end_of_text|>", "</s>", "[/TOOL_CALLS]", "[/ACTIONS]"},
-	},
-}
-
-// this maps well known template used in HF to model families defined above
-var knownTemplates = map[string]familyType{
-	`{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\n' + content + '<|im_end|>\n<|im_start|>assistant\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\n' }}{% endif %}{% endfor %}`: ChatML,
-	`{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}`: Mistral03,
-}
-
 func guessGGUFFromFile(cfg *ModelConfig, f *gguf.GGUFFile, defaultCtx int) {
 
 	if defaultCtx == 0 && cfg.ContextSize == nil {
@@ -216,81 +82,9 @@ func guessGGUFFromFile(cfg *ModelConfig, f *gguf.GGUFFile, defaultCtx int) {
 		cfg.Name = f.Metadata().Name
 	}
 
-	family := identifyFamily(f)
-
-	if family == Unknown {
-		log.Debug().Msgf("guessDefaultsFromFile: %s", "family not identified")
-		return
-	}
-
-	// identify template
-	settings, ok := defaultsSettings[family]
-	if ok {
-		cfg.TemplateConfig = settings.TemplateConfig
-		log.Debug().Any("family", family).Msgf("guessDefaultsFromFile: guessed template %+v", cfg.TemplateConfig)
-		if len(cfg.StopWords) == 0 {
-			cfg.StopWords = settings.StopWords
-		}
-		if cfg.RepeatPenalty == 0.0 {
-			cfg.RepeatPenalty = settings.RepeatPenalty
-		}
-	} else {
-		log.Debug().Any("family", family).Msgf("guessDefaultsFromFile: no template found for family")
-	}
-
-	if cfg.HasTemplate() {
-		return
-	}
-
-	// identify from well known templates first, otherwise use the raw jinja template
-	chatTemplate, found := f.Header.MetadataKV.Get("tokenizer.chat_template")
-	if found {
-		// try to use the jinja template
-		cfg.TemplateConfig.JinjaTemplate = true
-		cfg.TemplateConfig.ChatMessage = chatTemplate.ValueString()
-	}
-
-}
-
-func identifyFamily(f *gguf.GGUFFile) familyType {
-
-	// identify from well known templates first
-	chatTemplate, found := f.Header.MetadataKV.Get("tokenizer.chat_template")
-	if found && chatTemplate.ValueString() != "" {
-		if family, ok := knownTemplates[chatTemplate.ValueString()]; ok {
-			return family
-		}
-	}
-
-	// otherwise try to identify from the model properties
-	arch := f.Architecture().Architecture
-	eosTokenID := f.Tokenizer().EOSTokenID
-	bosTokenID := f.Tokenizer().BOSTokenID
-
-	isYI := arch == "llama" && bosTokenID == 1 && eosTokenID == 2
-	// WTF! Mistral0.3 and isYi have same bosTokenID and eosTokenID
-
-	llama3 := arch == "llama" && eosTokenID == 128009
-	commandR := arch == "command-r" && eosTokenID == 255001
-	qwen2 := arch == "qwen2"
-	phi3 := arch == "phi-3"
-	gemma := strings.HasPrefix(arch, "gemma") || strings.Contains(strings.ToLower(f.Metadata().Name), "gemma")
-	deepseek2 := arch == "deepseek2"
-
-	switch {
-	case deepseek2:
-		return DeepSeek2
-	case gemma:
-		return Gemma
-	case llama3:
-		return LLaMa3
-	case commandR:
-		return CommandR
-	case phi3:
-		return Phi3
-	case qwen2, isYI:
-		return ChatML
-	default:
-		return Unknown
-	}
+	// Instruct to use template from llama.cpp
+	cfg.TemplateConfig.UseTokenizerTemplate = true
+	cfg.FunctionsConfig.GrammarConfig.NoGrammar = true
+	cfg.Options = append(cfg.Options, "use_jinja:true")
+	cfg.KnownUsecaseStrings = append(cfg.KnownUsecaseStrings, "FLAG_CHAT")
 }
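
With family detection and template guessing gone, a GGUF model now simply defers to the chat template embedded in its tokenizer: UseTokenizerTemplate is enabled, grammar generation is turned off, "use_jinja:true" is appended to the backend options, and the model is flagged as chat-capable. Backend options travel as plain "key:value" strings; below is a hedged sketch of how such strings can be split (the real parsing happens in the C++ backend, which is not rendered in this commit view):

```go
package main

import (
	"fmt"
	"strings"
)

// parseOptions splits "key:value" option strings into a map. This is an
// illustrative helper, not LocalAI's actual parsing code.
func parseOptions(options []string) map[string]string {
	parsed := map[string]string{}
	for _, opt := range options {
		// Split on the first ':' only, so values may themselves contain colons.
		key, value, found := strings.Cut(opt, ":")
		if !found {
			value = "true" // treat bare keys as boolean flags (assumption)
		}
		parsed[key] = value
	}
	return parsed
}

func main() {
	opts := parseOptions([]string{"use_jinja:true"})
	fmt.Println(opts["use_jinja"]) // prints "true"
}
```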

core/config/model_config.go

Lines changed: 15 additions & 12 deletions
@@ -265,19 +265,10 @@ type TemplateConfig struct {
 
 	Multimodal string `yaml:"multimodal" json:"multimodal"`
 
-	JinjaTemplate bool `yaml:"jinja_template" json:"jinja_template"`
-
 	ReplyPrefix string `yaml:"reply_prefix" json:"reply_prefix"`
 }
 
-func (c *ModelConfig) UnmarshalYAML(value *yaml.Node) error {
-	type BCAlias ModelConfig
-	var aux BCAlias
-	if err := value.Decode(&aux); err != nil {
-		return err
-	}
-	*c = ModelConfig(aux)
-
+func (c *ModelConfig) syncKnownUsecasesFromString() {
 	c.KnownUsecases = GetUsecasesFromYAML(c.KnownUsecaseStrings)
 	// Make sure the usecases are valid, we rewrite with what we identified
 	c.KnownUsecaseStrings = []string{}
@@ -286,6 +277,17 @@ func (c *ModelConfig) UnmarshalYAML(value *yaml.Node) error {
 		c.KnownUsecaseStrings = append(c.KnownUsecaseStrings, k)
 	}
 }
+}
+
+func (c *ModelConfig) UnmarshalYAML(value *yaml.Node) error {
+	type BCAlias ModelConfig
+	var aux BCAlias
+	if err := value.Decode(&aux); err != nil {
+		return err
+	}
+	*c = ModelConfig(aux)
+
+	c.syncKnownUsecasesFromString()
 	return nil
 }
 
@@ -462,6 +464,7 @@ func (cfg *ModelConfig) SetDefaults(opts ...ConfigLoaderOption) {
 	}
 
 	guessDefaultsFromFile(cfg, lo.modelPath, ctx)
+	cfg.syncKnownUsecasesFromString()
 }
 
 func (c *ModelConfig) Validate() bool {
@@ -492,7 +495,7 @@ func (c *ModelConfig) Validate() bool {
 }
 
 func (c *ModelConfig) HasTemplate() bool {
-	return c.TemplateConfig.Completion != "" || c.TemplateConfig.Edit != "" || c.TemplateConfig.Chat != "" || c.TemplateConfig.ChatMessage != ""
+	return c.TemplateConfig.Completion != "" || c.TemplateConfig.Edit != "" || c.TemplateConfig.Chat != "" || c.TemplateConfig.ChatMessage != "" || c.TemplateConfig.UseTokenizerTemplate
 }
 
 func (c *ModelConfig) GetModelConfigFile() string {
@@ -573,7 +576,7 @@ func (c *ModelConfig) HasUsecases(u ModelConfigUsecases) bool {
 // This avoids the maintenance burden of updating this list for each new backend - but unfortunately, that's the best option for some services currently.
 func (c *ModelConfig) GuessUsecases(u ModelConfigUsecases) bool {
 	if (u & FLAG_CHAT) == FLAG_CHAT {
-		if c.TemplateConfig.Chat == "" && c.TemplateConfig.ChatMessage == "" {
+		if c.TemplateConfig.Chat == "" && c.TemplateConfig.ChatMessage == "" && !c.TemplateConfig.UseTokenizerTemplate {
 			return false
 		}
 	}
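
Extracting syncKnownUsecasesFromString lets SetDefaults re-validate the usecase list after guessGGUFFromFile has appended "FLAG_CHAT", instead of only doing so during YAML unmarshalling. Below is a self-contained sketch of that string/bitmask round-trip; the flag constants and lookup map are illustrative stand-ins, not LocalAI's actual definitions:

```go
package main

import "fmt"

type ModelConfigUsecases int

const (
	FLAG_CHAT ModelConfigUsecases = 1 << iota
	FLAG_COMPLETION
)

var usecaseFlags = map[string]ModelConfigUsecases{
	"FLAG_CHAT":       FLAG_CHAT,
	"FLAG_COMPLETION": FLAG_COMPLETION,
}

// syncUsecases folds the strings into a bitmask, then rewrites the string
// list from the validated mask, mirroring syncKnownUsecasesFromString.
func syncUsecases(known []string) (ModelConfigUsecases, []string) {
	var mask ModelConfigUsecases
	for _, s := range known {
		mask |= usecaseFlags[s] // unknown strings map to 0 and are dropped
	}
	out := []string{}
	for name, flag := range usecaseFlags {
		if mask&flag == flag {
			out = append(out, name)
		}
	}
	return mask, out
}

func main() {
	// After guessGGUFFromFile appends "FLAG_CHAT", SetDefaults re-syncs:
	mask, names := syncUsecases([]string{"FLAG_CHAT", "not-a-flag"})
	fmt.Println(mask, names) // prints "1 [FLAG_CHAT]"
}
```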
