@@ -10189,7 +10189,7 @@ struct llm_build_starcoder2 : public llm_graph_context {
1018910189 }
1019010190};
1019110191
10192- struct llm_graph_context_mamba : public virtual llm_graph_context {
10192+ struct llm_graph_context_mamba : public llm_graph_context {
1019310193 llm_graph_context_mamba(const llm_graph_params & params) : llm_graph_context(params) {}
1019410194
1019510195 ggml_tensor * build_mamba_layer(
@@ -10466,8 +10466,7 @@ struct llm_graph_context_mamba : public virtual llm_graph_context {
1046610466};
1046710467
1046810468struct llm_build_mamba : public llm_graph_context_mamba {
10469- llm_build_mamba(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf)
10470- : llm_graph_context(params), llm_graph_context_mamba(params) {
10469+ llm_build_mamba(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context_mamba(params) {
1047110470 ggml_tensor * cur;
1047210471 ggml_tensor * inpL;
1047310472
@@ -10524,8 +10523,7 @@ struct llm_build_mamba : public llm_graph_context_mamba {
1052410523};
1052510524
1052610525struct llm_build_jamba : public llm_graph_context_mamba {
10527- llm_build_jamba(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf)
10528- : llm_graph_context(params), llm_graph_context_mamba(params) {
10526+ llm_build_jamba(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context_mamba(params) {
1052910527 const int64_t n_embd_head = hparams.n_embd_head_v;
1053010528
1053110529 ggml_tensor * cur;
@@ -13958,8 +13956,78 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base {
1395813956 }
1395913957};
1396013958
13961- struct llm_graph_context_granite : public virtual llm_graph_context {
13962- llm_graph_context_granite(const llm_graph_params & params) : llm_graph_context(params) {}
13959+ struct llm_build_granite : public llm_graph_context {
// Graph builder for Granite / Granite-MoE: a standard decoder-only
// transformer stack with Granite-specific residual and logit scaling.
llm_build_granite(
    const llama_model & model,
    const llm_graph_params & params,
    ggml_cgraph * gf)
    : llm_graph_context(params) {

    const int64_t n_embd_head = hparams.n_embd_head_v;

    // Granite requires uniform head sizes and full-head rotation
    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
    GGML_ASSERT(n_embd_head == hparams.n_rot);

    ggml_tensor * cur;
    ggml_tensor * inpL;

    inpL = build_inp_embd(model.tok_embd);

    // inp_pos - built only if rope enabled
    ggml_tensor * inp_pos = nullptr;
    if (hparams.rope_finetuned) {
        inp_pos = build_inp_pos();
    }

    auto * inp_attn = build_attn_inp_kv_unified();

    ggml_tensor * inp_out_ids = build_inp_out_ids();

    for (int il = 0; il < n_layer; ++il) {
        // residual branch input for this layer
        ggml_tensor * inpSA = inpL;

        // norm
        cur = build_norm(inpL,
                model.layers[il].attn_norm, NULL,
                LLM_NORM_RMS, il);
        cb(cur, "attn_norm", il);

        // self-attention
        cur = build_attention_layer(
            gf, cur, inp_pos, inp_attn,
            model, n_embd_head, il);

        // on the last layer, keep only the rows selected for output
        if (il == n_layer - 1 && inp_out_ids) {
            cur   = ggml_get_rows(ctx0, cur,   inp_out_ids);
            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
        }

        // ffn
        cur = build_layer_ffn(cur, inpSA, model, il);

        // input for next layer
        inpL = cur;
    }

    cur = inpL;

    cur = build_norm(cur,
            model.output_norm, NULL,
            LLM_NORM_RMS, -1);

    cb(cur, "result_norm", -1);
    res->t_embd = cur;

    // lm_head
    cur = build_lora_mm(model.output, cur);

    // For Granite architectures - scale logits
    // NOTE(review): divides unconditionally - assumes f_logit_scale != 0 for
    // Granite models; confirm this is guaranteed by the model loader.
    cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
    cb(cur, "result_output", -1);
    res->t_logits = cur;

    ggml_build_forward_expand(gf, cur);
}
1396314031
1396414032 ggml_tensor * build_attention_layer(
1396514033 ggml_cgraph * gf,
@@ -14104,89 +14172,13 @@ struct llm_graph_context_granite : public virtual llm_graph_context {
1410414172 }
1410514173};
1410614174
14107- struct llm_build_granite : public llm_graph_context_granite {
14108- llm_build_granite(
14109- const llama_model & model,
14110- const llm_graph_params & params,
14111- ggml_cgraph * gf)
14112- : llm_graph_context(params), llm_graph_context_granite(params) {
14113-
14114- const int64_t n_embd_head = hparams.n_embd_head_v;
14115-
14116- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
14117- GGML_ASSERT(n_embd_head == hparams.n_rot);
14118-
14119- ggml_tensor * cur;
14120- ggml_tensor * inpL;
14121-
14122- inpL = build_inp_embd(model.tok_embd);
14123-
14124- // inp_pos - built only if rope enabled
14125- ggml_tensor * inp_pos = nullptr;
14126- if (hparams.rope_finetuned) {
14127- inp_pos = build_inp_pos();
14128- }
14129-
14130- auto * inp_attn = build_attn_inp_kv_unified();
14131-
14132- ggml_tensor * inp_out_ids = build_inp_out_ids();
14133-
14134- for (int il = 0; il < n_layer; ++il) {
14135- ggml_tensor * inpSA = inpL;
14136-
14137- // norm
14138- cur = build_norm(inpL,
14139- model.layers[il].attn_norm, NULL,
14140- LLM_NORM_RMS, il);
14141- cb(cur, "attn_norm", il);
14142-
14143- // self-attention
14144- cur = build_attention_layer(
14145- gf, cur, inp_pos, inp_attn,
14146- model, n_embd_head, il);
14147-
14148- if (il == n_layer - 1 && inp_out_ids) {
14149- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
14150- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
14151- }
14152-
14153- // ffn
14154- cur = build_layer_ffn(cur, inpSA, model, il);
14155-
14156- // input for next layer
14157- inpL = cur;
14158- }
14159-
14160- cur = inpL;
14161-
14162- cur = build_norm(cur,
14163- model.output_norm, NULL,
14164- LLM_NORM_RMS, -1);
14165-
14166- cb(cur, "result_norm", -1);
14167- res->t_embd = cur;
14168-
14169- // lm_head
14170- cur = build_lora_mm(model.output, cur);
14171-
14172- // For Granite architectures - scale logits
14173- cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
14174- cb(cur, "result_output", -1);
14175- res->t_logits = cur;
14176-
14177- ggml_build_forward_expand(gf, cur);
14178- }
14179- };
14180-
14181- struct llm_build_granite_hybrid : public llm_graph_context_mamba, public llm_graph_context_granite {
14175+ struct llm_build_granite_hybrid : public llm_graph_context_mamba {
1418214176
1418314177 llm_build_granite_hybrid(
1418414178 const llama_model & model,
1418514179 const llm_graph_params & params,
1418614180 ggml_cgraph * gf) :
14187- llm_graph_context(params),
14188- llm_graph_context_mamba(params),
14189- llm_graph_context_granite(params) {
14181+ llm_graph_context_mamba(params) {
1419014182
1419114183 const int64_t n_embd_head = hparams.n_embd_head_v;
1419214184 GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -14258,6 +14250,148 @@ struct llm_build_granite_hybrid : public llm_graph_context_mamba, public llm_gra
1425814250
1425914251 ggml_build_forward_expand(gf, cur);
1426014252 }
14253+
// Build the self-attention part of one decoder layer.
//
//   cur         - normalized layer input
//   inp_pos     - position ids; may be nullptr when RoPE is disabled
//                 (it is only read inside the use_rope branch)
//   inp_attn    - unified KV-cache attention input
//   n_embd_head - per-head embedding size
//
// Returns the attention output after the wo/bo output projection.
ggml_tensor * build_attention_layer(
        ggml_cgraph * gf,
        ggml_tensor * cur,
        ggml_tensor * inp_pos,
        llm_graph_input_attn_kv_unified * inp_attn,
        const llama_model & model,
        const int64_t n_embd_head,
        const int il) {

    // compute Q and K and (optionally) RoPE them
    ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
    cb(Qcur, "Qcur", il);
    if (model.layers[il].bq) {
        Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
        cb(Qcur, "Qcur", il);
    }

    ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
    cb(Kcur, "Kcur", il);
    if (model.layers[il].bk) {
        Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
        cb(Kcur, "Kcur", il);
    }

    ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
    cb(Vcur, "Vcur", il);
    if (model.layers[il].bv) {
        Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
        cb(Vcur, "Vcur", il);
    }

    // split into heads: [n_embd_head, n_head(_kv), n_tokens]
    Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, hparams.n_head(il),    n_tokens);
    Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
    Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, hparams.n_head_kv(il), n_tokens);

    const bool use_rope = hparams.rope_finetuned;
    if (use_rope) {
        ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
        Qcur = ggml_rope_ext(
            ctx0, Qcur, inp_pos, rope_factors,
            n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
            ext_factor, attn_factor, beta_fast, beta_slow
        );

        Kcur = ggml_rope_ext(
            ctx0, Kcur, inp_pos, rope_factors,
            n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
            ext_factor, attn_factor, beta_fast, beta_slow
        );
    }

    cb(Qcur, "Qcur", il);
    cb(Kcur, "Kcur", il);
    cb(Vcur, "Vcur", il);

    // Granite override: f_attention_scale replaces the usual 1/sqrt(d) when non-zero
    const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
    cur = build_attn(inp_attn, gf,
            model.layers[il].wo, model.layers[il].bo,
            Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
    cb(cur, "attn_out", il);
    return cur;
}
14316+
// Build the FFN part of one decoder layer, including both residual adds.
//
//   cur   - attention output
//   inpSA - pre-attention layer input (residual branch)
//
// Granite scales each residual contribution by hparams.f_residual_scale
// (skipped when it is 0). Uses a dense SiLU-gated FFN when the layer has no
// expert router (ffn_gate_inp == nullptr), otherwise the MoE branch, adding
// the shared-expert FFN when n_ff_shexp > 0. Finishes with the per-layer
// control vector (build_cvec).
ggml_tensor * build_layer_ffn(
        ggml_tensor * cur,
        ggml_tensor * inpSA,
        const llama_model & model,
        const int il) {

    // For Granite architectures - scale residual
    if (hparams.f_residual_scale) {
        cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
    }
    ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
    cb(ffn_inp, "ffn_inp", il);

    // feed-forward network (non-MoE)
    if (model.layers[il].ffn_gate_inp == nullptr) {

        cur = build_norm(ffn_inp,
                model.layers[il].ffn_norm, NULL,
                LLM_NORM_RMS, il);
        cb(cur, "ffn_norm", il);

        cur = build_ffn(cur,
                model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
                model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
                model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
                NULL,
                LLM_FFN_SILU, LLM_FFN_PAR, il);
        cb(cur, "ffn_out", il);

    } else {
        // MoE branch
        cur = build_norm(ffn_inp,
                model.layers[il].ffn_norm, NULL,
                LLM_NORM_RMS, il);
        cb(cur, "ffn_norm", il);

        ggml_tensor * moe_out = build_moe_ffn(cur,
                model.layers[il].ffn_gate_inp,
                model.layers[il].ffn_up_exps,
                model.layers[il].ffn_gate_exps,
                model.layers[il].ffn_down_exps,
                nullptr,
                n_expert, n_expert_used,
                LLM_FFN_SILU, true,
                false, 0.0,
                LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
                il);
        cb(moe_out, "ffn_moe_out", il);

        // For Granite MoE Shared
        if (hparams.n_ff_shexp > 0) {
            // shared experts run on the same normalized input as the router
            ggml_tensor * ffn_shexp = build_ffn(cur,
                    model.layers[il].ffn_up_shexp,   NULL, NULL,
                    model.layers[il].ffn_gate_shexp, NULL, NULL,
                    model.layers[il].ffn_down_shexp, NULL, NULL,
                    NULL,
                    LLM_FFN_SILU, LLM_FFN_PAR, il);
            cb(ffn_shexp, "ffn_shexp", il);

            cur = ggml_add(ctx0, moe_out, ffn_shexp);
            cb(cur, "ffn_out", il);
        } else {
            cur = moe_out;
        }
    }

    // For Granite architectures - scale residual
    if (hparams.f_residual_scale) {
        cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
    }
    cur = ggml_add(ctx0, cur, ffn_inp);
    cb(cur, "ffn_out", il);

    cur = build_cvec(cur, il);
    cb(cur, "l_out", il);

    return cur;
}
1426114395};
1426214396
1426314397// ref: https://github.com/facebookresearch/chameleon
@@ -15192,8 +15326,7 @@ struct llm_build_ernie4_5 : public llm_graph_context {
1519215326};
1519315327
1519415328struct llm_build_falcon_h1 : public llm_graph_context_mamba {
15195- llm_build_falcon_h1(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf)
15196- : llm_graph_context(params), llm_graph_context_mamba(params) {
15329+ llm_build_falcon_h1(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context_mamba(params) {
1519715330 const int64_t n_embd_head = hparams.n_embd_head_v;
1519815331
1519915332 ggml_tensor * cur;
0 commit comments