@@ -2,7 +2,6 @@ package llama
 
 import (
 	"cmp"
-	"fmt"
 	"math"
 
 	"github.com/ollama/ollama/fs"
@@ -23,51 +22,60 @@ type Options struct {
 
 type Model struct {
 	model.Base
-	model.BytePairEncoding
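+	// TextProcessor is the tokenizer interface; both the BPE and SentencePiece implementations below satisfy it.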
+	model.TextProcessor
 
 	TokenEmbedding *nn.Embedding `gguf:"token_embd"`
 	Layers         []Layer       `gguf:"blk"`
 	OutputNorm     *nn.RMSNorm   `gguf:"output_norm"`
 	Output         *nn.Linear    `gguf:"output,alt:token_embd"`
 
-	*Options
+	Options
 }
 
 func New(c fs.Config) (model.Model, error) {
-	// This model currently only supports the gpt2 tokenizer
-	if c.String("tokenizer.ggml.model") == "llama" {
-		return nil, fmt.Errorf("unsupported tokenizer: llama")
+	if c.Uint("expert_count") > 0 {
+		// TODO: support mixtures of experts
+		return nil, model.ErrUnsupportedModel
 	}
-	// Best effort detection of library/deepseek-coder model(s) which are incompatible
-	if c.String("general.name") == "deepseek-ai" {
-		return nil, fmt.Errorf("unsupported model: %s", c.String("general.name"))
+
+	var processor model.TextProcessor
+	vocabulary := model.Vocabulary{
+		Values: c.Strings("tokenizer.ggml.tokens"),
+		Scores: c.Floats("tokenizer.ggml.scores"),
+		Types:  c.Ints("tokenizer.ggml.token_type"),
+		Merges: c.Strings("tokenizer.ggml.merges"),
+		AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
+		BOS:    []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
+		AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
+		EOS: append(
+			[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
+			c.Ints("tokenizer.ggml.eos_token_ids")...,
+		),
+	}
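+	// Select the tokenizer declared in the GGUF metadata: "gpt2" vocabularies use byte-pair encoding, "llama" vocabularies use SentencePiece.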
+	switch c.String("tokenizer.ggml.model") {
+	case "gpt2":
+		processor = model.NewBytePairEncoding(
+			`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
+			&vocabulary,
+		)
+	case "llama":
+		processor = model.NewSentencePiece(&vocabulary)
+	default:
+		return nil, model.ErrUnsupportedTokenizer
 	}
+
 	m := Model{
-		BytePairEncoding: model.NewBytePairEncoding(
-			c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
-			&model.Vocabulary{
-				Values: c.Strings("tokenizer.ggml.tokens"),
-				Types:  c.Ints("tokenizer.ggml.token_type"),
-				Merges: c.Strings("tokenizer.ggml.merges"),
-				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
-				BOS:    []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
-				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
-				EOS: append(
-					[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
-					c.Ints("tokenizer.ggml.eos_token_ids")...,
-				),
-			},
-		),
-		Layers:  make([]Layer, c.Uint("block_count")),
-		Options: &Options{
+		TextProcessor: processor,
+		Layers:        make([]Layer, c.Uint("block_count")),
+		Options: Options{
 			hiddenSize: int(c.Uint("embedding_length")),
 			numHeads:   int(c.Uint("attention.head_count")),
 			numKVHeads: int(c.Uint("attention.head_count_kv")),
 			headDim:    int(c.Uint("attention.key_length")),
 			ropeDim:    int(c.Uint("rope.dimension_count")),
 			eps:        c.Float("attention.layer_norm_rms_epsilon"),
-			ropeBase:   c.Float("rope.freq_base"),
-			ropeScale:  c.Float("rope.freq_scale", 1),
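+			// rope.scaling.factor is the context-extension scaling factor; RoPE call sites take its reciprocal to recover freq_scale.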
+			ropeBase:   c.Float("rope.freq_base", 1e5),
+			ropeScale:  c.Float("rope.scaling.factor", 1),
 		},
 	}
 
@@ -98,8 +106,8 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positions ml.Tenso
 	value := sa.Value.Forward(ctx, hiddenState)
 	value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
 
-	query = fast.RoPE(ctx, query, positions, ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))
-	key = fast.RoPE(ctx, key, positions, ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))
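+	// fast.RoPE expects freq_scale; ropeScale now stores the scaling factor, so pass its reciprocal.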
+	query = fast.RoPE(ctx, query, positions, ropeDim, opts.ropeBase, 1./opts.ropeScale, rope.WithFactors(sa.RopeFactors))
+	key = fast.RoPE(ctx, key, positions, ropeDim, opts.ropeBase, 1./opts.ropeScale, rope.WithFactors(sa.RopeFactors))
 
 	attention := nn.Attention(ctx, query, key, value, 1.0/math.Sqrt(float64(headDim)), cache)
 	attention = attention.Reshape(ctx, headDim*opts.numHeads, batchSize)
@@ -108,7 +116,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positions ml.Tenso
 
 func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
 	ropeDim := cmp.Or(m.ropeDim, m.hiddenSize/m.numHeads)
-	return fast.RoPE(ctx, key, shift, ropeDim, m.ropeBase, m.ropeScale, rope.WithFactors(m.Layers[layer].SelfAttention.RopeFactors)), nil
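+	// Same freq_scale reciprocal conversion as in SelfAttention.Forward.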
+	return fast.RoPE(ctx, key, shift, ropeDim, m.ropeBase, 1./m.ropeScale, rope.WithFactors(m.Layers[layer].SelfAttention.RopeFactors)), nil
 }
 
 type MLP struct {
@@ -163,7 +171,7 @@ func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
 			outputs = batch.Outputs
 		}
 
-		hiddenState = layer.Forward(ctx, hiddenState, positions, outputs, m.Cache, m.Options)
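+		// Options is embedded by value on Model, so take its address for Layer.Forward.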
+		hiddenState = layer.Forward(ctx, hiddenState, positions, outputs, m.Cache, &m.Options)
 	}
 
 	hiddenState = m.OutputNorm.Forward(ctx, hiddenState, m.eps)