Commit 02cc8cb

feat(llama.cpp): consolidate options and respect tokenizer template when enabled (#7120)
* feat(llama.cpp): expose env vars as options for consistency

  This allows configuring everything in the model's YAML file rather than relying on global configuration.

* feat(llama.cpp): respect usetokenizertemplate and use llama.cpp templating system to process messages
* WIP
* Detect template exists if use tokenizer template is enabled
* Better recognition of chat
* Fixes to support tool calls while using templates from tokenizer
* Fixups
* Drop template guessing, fix passing tools to tokenizer
* Extract grammar and other options from chat template, add schema struct
* WIP
* WIP
* Automatically set use_jinja
* Cleanups, identify GGUF models for chat by default
* Update docs

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
1 parent e5e86d0 commit 02cc8cb

File tree

17 files changed: +974, -545 lines


backend/backend.proto

Lines changed: 7 additions & 0 deletions
@@ -154,6 +154,8 @@ message PredictOptions {
   repeated string Videos = 45;
   repeated string Audios = 46;
   string CorrelationId = 47;
+  string Tools = 48; // JSON array of available tools/functions for tool calling
+  string ToolChoice = 49; // JSON string or object specifying tool choice behavior
 }
 
 // The response message containing the result
@@ -382,6 +384,11 @@ message StatusResponse {
 message Message {
   string role = 1;
   string content = 2;
+  // Optional fields for OpenAI-compatible message format
+  string name = 3; // Tool name (for tool messages)
+  string tool_call_id = 4; // Tool call ID (for tool messages)
+  string reasoning_content = 5; // Reasoning content (for thinking models)
+  string tool_calls = 6; // Tool calls as JSON string (for assistant messages with tool calls)
 }
 
 message DetectOptions {
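
The new fields keep tool data as opaque JSON strings rather than modeling the OpenAI schema in protobuf, so the Go core can hand them to the llama.cpp backend unchanged. Below is a minimal sketch of how a caller might populate them; the structs are local stand-ins for the generated protoc-gen-go types, and the CamelCase field names (Tools, ToolChoice, ToolCallId, ...) are assumptions based on the standard proto-to-Go name mapping:

```go
package main

import (
	"encoding/json"
	"fmt"
)

// Local mirrors of the generated proto types, for illustration only.
type PredictOptions struct {
	Tools      string // JSON array of available tools (field 48 above)
	ToolChoice string // JSON tool-choice spec (field 49 above)
}

type Message struct {
	Role             string
	Content          string
	Name             string // tool name, for role == "tool"
	ToolCallId       string // links a tool result back to the originating call
	ReasoningContent string // reasoning text emitted by thinking models
	ToolCalls        string // JSON-encoded tool calls on assistant turns
}

func main() {
	// Tools are serialized once here and parsed again by the backend,
	// which feeds them to the tokenizer's chat template.
	tools, _ := json.Marshal([]map[string]any{{
		"type": "function",
		"function": map[string]any{
			"name":        "get_weather", // hypothetical tool
			"description": "Get the current weather for a city",
		},
	}})

	opts := PredictOptions{Tools: string(tools), ToolChoice: `"auto"`}
	toolResult := Message{Role: "tool", Name: "get_weather", ToolCallId: "call_0", Content: `{"temp_c": 21}`}
	fmt.Println(opts.Tools, toolResult.ToolCallId)
}
```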

backend/cpp/llama-cpp/grpc-server.cpp

Lines changed: 495 additions & 73 deletions
Large diffs are not rendered by default.

core/backend/llm.go

Lines changed: 5 additions & 26 deletions
@@ -2,8 +2,6 @@ package backend
 
 import (
 	"context"
-	"encoding/json"
-	"fmt"
 	"regexp"
 	"slices"
 	"strings"
@@ -35,7 +33,7 @@ type TokenUsage struct {
 	TimingTokenGeneration float64
 }
 
-func ModelInference(ctx context.Context, s string, messages []schema.Message, images, videos, audios []string, loader *model.ModelLoader, c *config.ModelConfig, cl *config.ModelConfigLoader, o *config.ApplicationConfig, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) {
+func ModelInference(ctx context.Context, s string, messages schema.Messages, images, videos, audios []string, loader *model.ModelLoader, c *config.ModelConfig, cl *config.ModelConfigLoader, o *config.ApplicationConfig, tokenCallback func(string, TokenUsage) bool, tools string, toolChoice string) (func() (LLMResponse, error), error) {
 	modelFile := c.Model
 
 	// Check if the modelFile exists, if it doesn't try to load it from the gallery
@@ -65,29 +63,8 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
 	var protoMessages []*proto.Message
 	// if we are using the tokenizer template, we need to convert the messages to proto messages
 	// unless the prompt has already been tokenized (non-chat endpoints + functions)
-	if c.TemplateConfig.UseTokenizerTemplate && s == "" {
-		protoMessages = make([]*proto.Message, len(messages), len(messages))
-		for i, message := range messages {
-			protoMessages[i] = &proto.Message{
-				Role: message.Role,
-			}
-			switch ct := message.Content.(type) {
-			case string:
-				protoMessages[i].Content = ct
-			case []interface{}:
-				// If using the tokenizer template, in case of multimodal we want to keep the multimodal content as and return only strings here
-				data, _ := json.Marshal(ct)
-				resultData := []struct {
-					Text string `json:"text"`
-				}{}
-				json.Unmarshal(data, &resultData)
-				for _, r := range resultData {
-					protoMessages[i].Content += r.Text
-				}
-			default:
-				return nil, fmt.Errorf("unsupported type for schema.Message.Content for inference: %T", ct)
-			}
-		}
+	if c.TemplateConfig.UseTokenizerTemplate && len(messages) > 0 {
+		protoMessages = messages.ToProto()
 	}
 
 	// in GRPC, the backend is supposed to answer to 1 single token if stream is not supported
@@ -99,6 +76,8 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
 	opts.Images = images
 	opts.Videos = videos
 	opts.Audios = audios
+	opts.Tools = tools
+	opts.ToolChoice = toolChoice
 
 	tokenUsage := TokenUsage{}
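
The inline conversion above moves behind a schema.Messages.ToProto() helper whose implementation is not shown in this commit view. The sketch below reconstructs its likely behavior from the deleted loop, using local stand-in types instead of the real schema and proto packages:

```go
package main

import (
	"encoding/json"
	"fmt"
)

// protoMessage stands in for the generated proto.Message type.
type protoMessage struct {
	Role    string
	Content string
}

type Message struct {
	Role    string      `json:"role"`
	Content interface{} `json:"content"` // string, or []interface{} of multimodal parts
}

type Messages []Message

// ToProto flattens each message into a proto message, keeping only the
// text parts of multimodal content, as the deleted code did.
func (m Messages) ToProto() []*protoMessage {
	out := make([]*protoMessage, len(m))
	for i, message := range m {
		out[i] = &protoMessage{Role: message.Role}
		switch ct := message.Content.(type) {
		case string:
			out[i].Content = ct
		case []interface{}:
			data, _ := json.Marshal(ct)
			var parts []struct {
				Text string `json:"text"`
			}
			json.Unmarshal(data, &parts)
			for _, p := range parts {
				out[i].Content += p.Text
			}
		}
	}
	return out
}

func main() {
	msgs := Messages{{Role: "user", Content: []interface{}{
		map[string]any{"type": "text", "text": "Hello"},
	}}}
	fmt.Println(msgs.ToProto()[0].Content) // prints "Hello"
}
```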

core/config/gguf.go

Lines changed: 5 additions & 211 deletions
@@ -1,151 +1,17 @@
 package config
 
 import (
-	"strings"
-
 	"github.com/mudler/LocalAI/pkg/xsysinfo"
 	"github.com/rs/zerolog/log"
 
 	gguf "github.com/gpustack/gguf-parser-go"
 )
 
-type familyType uint8
-
-const (
-	Unknown familyType = iota
-	LLaMa3
-	CommandR
-	Phi3
-	ChatML
-	Mistral03
-	Gemma
-	DeepSeek2
-)
-
 const (
 	defaultContextSize = 1024
 	defaultNGPULayers  = 99999999
 )
 
-type settingsConfig struct {
-	StopWords      []string
-	TemplateConfig TemplateConfig
-	RepeatPenalty  float64
-}
-
-// default settings to adopt with a given model family
-var defaultsSettings map[familyType]settingsConfig = map[familyType]settingsConfig{
-	Gemma: {
-		RepeatPenalty: 1.0,
-		StopWords:     []string{"<|im_end|>", "<end_of_turn>", "<start_of_turn>"},
-		TemplateConfig: TemplateConfig{
-			Chat:        "{{.Input }}\n<start_of_turn>model\n",
-			ChatMessage: "<start_of_turn>{{if eq .RoleName \"assistant\" }}model{{else}}{{ .RoleName }}{{end}}\n{{ if .Content -}}\n{{.Content -}}\n{{ end -}}<end_of_turn>",
-			Completion:  "{{.Input}}",
-		},
-	},
-	DeepSeek2: {
-		StopWords: []string{"<|end▁of▁sentence|>"},
-		TemplateConfig: TemplateConfig{
-			ChatMessage: `{{if eq .RoleName "user" -}}User: {{.Content }}
-{{ end -}}
-{{if eq .RoleName "assistant" -}}Assistant: {{.Content}}<|end▁of▁sentence|>{{end}}
-{{if eq .RoleName "system" -}}{{.Content}}
-{{end -}}`,
-			Chat: "{{.Input -}}\nAssistant: ",
-		},
-	},
-	LLaMa3: {
-		StopWords: []string{"<|eot_id|>"},
-		TemplateConfig: TemplateConfig{
-			Chat:        "<|begin_of_text|>{{.Input }}\n<|start_header_id|>assistant<|end_header_id|>",
-			ChatMessage: "<|start_header_id|>{{ .RoleName }}<|end_header_id|>\n\n{{.Content }}<|eot_id|>",
-		},
-	},
-	CommandR: {
-		TemplateConfig: TemplateConfig{
-			Chat: "{{.Input -}}<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",
-			Functions: `<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>
-You are a function calling AI model, you can call the following functions:
-## Available Tools
-{{range .Functions}}
-- {"type": "function", "function": {"name": "{{.Name}}", "description": "{{.Description}}", "parameters": {{toJson .Parameters}} }}
-{{end}}
-When using a tool, reply with JSON, for instance {"name": "tool_name", "arguments": {"param1": "value1", "param2": "value2"}}
-<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{.Input -}}`,
-			ChatMessage: `{{if eq .RoleName "user" -}}
-<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
-{{- else if eq .RoleName "system" -}}
-<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
-{{- else if eq .RoleName "assistant" -}}
-<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
-{{- else if eq .RoleName "tool" -}}
-<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
-{{- else if .FunctionCall -}}
-<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{toJson .FunctionCall}}}<|END_OF_TURN_TOKEN|>
-{{- end -}}`,
-		},
-		StopWords: []string{"<|END_OF_TURN_TOKEN|>"},
-	},
-	Phi3: {
-		TemplateConfig: TemplateConfig{
-			Chat:        "{{.Input}}\n<|assistant|>",
-			ChatMessage: "<|{{ .RoleName }}|>\n{{.Content}}<|end|>",
-			Completion:  "{{.Input}}",
-		},
-		StopWords: []string{"<|end|>", "<|endoftext|>"},
-	},
-	ChatML: {
-		TemplateConfig: TemplateConfig{
-			Chat: "{{.Input -}}\n<|im_start|>assistant",
-			Functions: `<|im_start|>system
-You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
-{{range .Functions}}
-{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
-{{end}}
-For each function call return a json object with function name and arguments
-<|im_end|>
-{{.Input -}}
-<|im_start|>assistant`,
-			ChatMessage: `<|im_start|>{{ .RoleName }}
-{{ if .FunctionCall -}}
-Function call:
-{{ else if eq .RoleName "tool" -}}
-Function response:
-{{ end -}}
-{{ if .Content -}}
-{{.Content }}
-{{ end -}}
-{{ if .FunctionCall -}}
-{{toJson .FunctionCall}}
-{{ end -}}<|im_end|>`,
-		},
-		StopWords: []string{"<|im_end|>", "<dummy32000>", "</s>"},
-	},
-	Mistral03: {
-		TemplateConfig: TemplateConfig{
-			Chat:      "{{.Input -}}",
-			Functions: `[AVAILABLE_TOOLS] [{{range .Functions}}{"type": "function", "function": {"name": "{{.Name}}", "description": "{{.Description}}", "parameters": {{toJson .Parameters}} }}{{end}} ] [/AVAILABLE_TOOLS]{{.Input }}`,
-			ChatMessage: `{{if eq .RoleName "user" -}}
-[INST] {{.Content }} [/INST]
-{{- else if .FunctionCall -}}
-[TOOL_CALLS] {{toJson .FunctionCall}} [/TOOL_CALLS]
-{{- else if eq .RoleName "tool" -}}
-[TOOL_RESULTS] {{.Content}} [/TOOL_RESULTS]
-{{- else -}}
-{{ .Content -}}
-{{ end -}}`,
-		},
-		StopWords: []string{"<|im_end|>", "<dummy32000>", "</tool_call>", "<|eot_id|>", "<|end_of_text|>", "</s>", "[/TOOL_CALLS]", "[/ACTIONS]"},
-	},
-}
-
-// this maps well known template used in HF to model families defined above
-var knownTemplates = map[string]familyType{
-	`{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\n' + content + '<|im_end|>\n<|im_start|>assistant\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\n' }}{% endif %}{% endfor %}`: ChatML,
-	`{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}`: Mistral03,
-}
-
 func guessGGUFFromFile(cfg *ModelConfig, f *gguf.GGUFFile, defaultCtx int) {
 
 	if defaultCtx == 0 && cfg.ContextSize == nil {
@@ -216,81 +82,9 @@ func guessGGUFFromFile(cfg *ModelConfig, f *gguf.GGUFFile, defaultCtx int) {
 		cfg.Name = f.Metadata().Name
 	}
 
-	family := identifyFamily(f)
-
-	if family == Unknown {
-		log.Debug().Msgf("guessDefaultsFromFile: %s", "family not identified")
-		return
-	}
-
-	// identify template
-	settings, ok := defaultsSettings[family]
-	if ok {
-		cfg.TemplateConfig = settings.TemplateConfig
-		log.Debug().Any("family", family).Msgf("guessDefaultsFromFile: guessed template %+v", cfg.TemplateConfig)
-		if len(cfg.StopWords) == 0 {
-			cfg.StopWords = settings.StopWords
-		}
-		if cfg.RepeatPenalty == 0.0 {
-			cfg.RepeatPenalty = settings.RepeatPenalty
-		}
-	} else {
-		log.Debug().Any("family", family).Msgf("guessDefaultsFromFile: no template found for family")
-	}
-
-	if cfg.HasTemplate() {
-		return
-	}
-
-	// identify from well known templates first, otherwise use the raw jinja template
-	chatTemplate, found := f.Header.MetadataKV.Get("tokenizer.chat_template")
-	if found {
-		// try to use the jinja template
-		cfg.TemplateConfig.JinjaTemplate = true
-		cfg.TemplateConfig.ChatMessage = chatTemplate.ValueString()
-	}
-
-}
-
-func identifyFamily(f *gguf.GGUFFile) familyType {
-
-	// identify from well known templates first
-	chatTemplate, found := f.Header.MetadataKV.Get("tokenizer.chat_template")
-	if found && chatTemplate.ValueString() != "" {
-		if family, ok := knownTemplates[chatTemplate.ValueString()]; ok {
-			return family
-		}
-	}
-
-	// otherwise try to identify from the model properties
-	arch := f.Architecture().Architecture
-	eosTokenID := f.Tokenizer().EOSTokenID
-	bosTokenID := f.Tokenizer().BOSTokenID
-
-	isYI := arch == "llama" && bosTokenID == 1 && eosTokenID == 2
-	// WTF! Mistral0.3 and isYi have same bosTokenID and eosTokenID
-
-	llama3 := arch == "llama" && eosTokenID == 128009
-	commandR := arch == "command-r" && eosTokenID == 255001
-	qwen2 := arch == "qwen2"
-	phi3 := arch == "phi-3"
-	gemma := strings.HasPrefix(arch, "gemma") || strings.Contains(strings.ToLower(f.Metadata().Name), "gemma")
-	deepseek2 := arch == "deepseek2"
-
-	switch {
-	case deepseek2:
-		return DeepSeek2
-	case gemma:
-		return Gemma
-	case llama3:
-		return LLaMa3
-	case commandR:
-		return CommandR
-	case phi3:
-		return Phi3
-	case qwen2, isYI:
-		return ChatML
-	default:
-		return Unknown
-	}
+	// Instruct to use template from llama.cpp
+	cfg.TemplateConfig.UseTokenizerTemplate = true
+	cfg.FunctionsConfig.GrammarConfig.NoGrammar = true
+	cfg.Options = append(cfg.Options, "use_jinja:true")
+	cfg.KnownUsecaseStrings = append(cfg.KnownUsecaseStrings, "FLAG_CHAT")
 }
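
With family detection and template guessing gone, a GGUF model now simply defers to the chat template embedded in its tokenizer: UseTokenizerTemplate is enabled, grammar generation is turned off, "use_jinja:true" is appended to the backend options, and the model is flagged as chat-capable. Backend options travel as plain "key:value" strings; below is a hedged sketch of how such strings can be split (the real parsing happens in the C++ backend, which is not rendered in this commit view):

```go
package main

import (
	"fmt"
	"strings"
)

// parseOptions splits "key:value" option strings into a map. This is an
// illustrative helper, not LocalAI's actual parsing code.
func parseOptions(options []string) map[string]string {
	parsed := map[string]string{}
	for _, opt := range options {
		// Split on the first ':' only, so values may themselves contain colons.
		key, value, found := strings.Cut(opt, ":")
		if !found {
			value = "true" // treat bare keys as boolean flags (assumption)
		}
		parsed[key] = value
	}
	return parsed
}

func main() {
	opts := parseOptions([]string{"use_jinja:true"})
	fmt.Println(opts["use_jinja"]) // prints "true"
}
```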

core/config/model_config.go

Lines changed: 15 additions & 12 deletions
@@ -265,19 +265,10 @@ type TemplateConfig struct {
 
 	Multimodal string `yaml:"multimodal" json:"multimodal"`
 
-	JinjaTemplate bool `yaml:"jinja_template" json:"jinja_template"`
-
 	ReplyPrefix string `yaml:"reply_prefix" json:"reply_prefix"`
 }
 
-func (c *ModelConfig) UnmarshalYAML(value *yaml.Node) error {
-	type BCAlias ModelConfig
-	var aux BCAlias
-	if err := value.Decode(&aux); err != nil {
-		return err
-	}
-	*c = ModelConfig(aux)
-
+func (c *ModelConfig) syncKnownUsecasesFromString() {
 	c.KnownUsecases = GetUsecasesFromYAML(c.KnownUsecaseStrings)
 	// Make sure the usecases are valid, we rewrite with what we identified
 	c.KnownUsecaseStrings = []string{}
@@ -286,6 +277,17 @@ func (c *ModelConfig) UnmarshalYAML(value *yaml.Node) error {
 		c.KnownUsecaseStrings = append(c.KnownUsecaseStrings, k)
 	}
 }
+}
+
+func (c *ModelConfig) UnmarshalYAML(value *yaml.Node) error {
+	type BCAlias ModelConfig
+	var aux BCAlias
+	if err := value.Decode(&aux); err != nil {
+		return err
+	}
+	*c = ModelConfig(aux)
+
+	c.syncKnownUsecasesFromString()
 	return nil
 }
 
@@ -462,6 +464,7 @@ func (cfg *ModelConfig) SetDefaults(opts ...ConfigLoaderOption) {
 	}
 
 	guessDefaultsFromFile(cfg, lo.modelPath, ctx)
+	cfg.syncKnownUsecasesFromString()
 }
 
 func (c *ModelConfig) Validate() bool {
@@ -492,7 +495,7 @@ func (c *ModelConfig) Validate() bool {
 }
 
 func (c *ModelConfig) HasTemplate() bool {
-	return c.TemplateConfig.Completion != "" || c.TemplateConfig.Edit != "" || c.TemplateConfig.Chat != "" || c.TemplateConfig.ChatMessage != ""
+	return c.TemplateConfig.Completion != "" || c.TemplateConfig.Edit != "" || c.TemplateConfig.Chat != "" || c.TemplateConfig.ChatMessage != "" || c.TemplateConfig.UseTokenizerTemplate
 }
 
 func (c *ModelConfig) GetModelConfigFile() string {
@@ -573,7 +576,7 @@ func (c *ModelConfig) HasUsecases(u ModelConfigUsecases) bool {
 // This avoids the maintenance burden of updating this list for each new backend - but unfortunately, that's the best option for some services currently.
 func (c *ModelConfig) GuessUsecases(u ModelConfigUsecases) bool {
 	if (u & FLAG_CHAT) == FLAG_CHAT {
-		if c.TemplateConfig.Chat == "" && c.TemplateConfig.ChatMessage == "" {
+		if c.TemplateConfig.Chat == "" && c.TemplateConfig.ChatMessage == "" && !c.TemplateConfig.UseTokenizerTemplate {
 			return false
 		}
 	}
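
Extracting syncKnownUsecasesFromString lets SetDefaults re-validate the usecase list after guessGGUFFromFile has appended "FLAG_CHAT", instead of only doing so during YAML unmarshalling. Below is a self-contained sketch of that string/bitmask round-trip; the flag constants and lookup map are illustrative stand-ins, not LocalAI's actual definitions:

```go
package main

import "fmt"

type ModelConfigUsecases int

const (
	FLAG_CHAT ModelConfigUsecases = 1 << iota
	FLAG_COMPLETION
)

var usecaseFlags = map[string]ModelConfigUsecases{
	"FLAG_CHAT":       FLAG_CHAT,
	"FLAG_COMPLETION": FLAG_COMPLETION,
}

// syncUsecases folds the strings into a bitmask, then rewrites the string
// list from the validated mask, mirroring syncKnownUsecasesFromString.
func syncUsecases(known []string) (ModelConfigUsecases, []string) {
	var mask ModelConfigUsecases
	for _, s := range known {
		mask |= usecaseFlags[s] // unknown strings map to 0 and are dropped
	}
	out := []string{}
	for name, flag := range usecaseFlags {
		if mask&flag == flag {
			out = append(out, name)
		}
	}
	return mask, out
}

func main() {
	// After guessGGUFFromFile appends "FLAG_CHAT", SetDefaults re-syncs:
	mask, names := syncUsecases([]string{"FLAG_CHAT", "not-a-flag"})
	fmt.Println(mask, names) // prints "1 [FLAG_CHAT]"
}
```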
