
Commit 9008027

hparams : add n_embd_inp() to support extended embed (#16928)
* add n_embd_full to support extended embed
* don't change output
* rename to n_embd_inp
* restore n_embd where applicable
1 parent 16bcc12 commit 9008027
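Note (illustration, not part of the diff): after this change, code that feeds pre-computed embeddings to the text model should size its buffers by the input embedding width, which for deepstack models such as Qwen3-VL is wider than the output width returned by llama_model_n_embd(). A minimal C++ sketch using the new getter; the helper name and the std::vector wrapper are hypothetical:

    // Sketch: size an input-embedding buffer with the new getter.
    // For models without deepstack layers, n_embd_inp == n_embd, so nothing changes.
    #include "llama.h"
    #include <vector>

    static std::vector<float> alloc_input_embd(const llama_model * model, int32_t n_tokens) {
        const int32_t n_embd_inp = llama_model_n_embd_inp(model); // main + deepstack dims
        return std::vector<float>((size_t) n_tokens * n_embd_inp, 0.0f); // filled by the caller, then passed as llama_batch.embd
    }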

9 files changed: +29 −28 lines

include/llama.h

Lines changed: 1 addition & 0 deletions
@@ -486,6 +486,7 @@ extern "C" {

     LLAMA_API int32_t llama_model_n_ctx_train(const struct llama_model * model);
     LLAMA_API int32_t llama_model_n_embd     (const struct llama_model * model);
+    LLAMA_API int32_t llama_model_n_embd_inp (const struct llama_model * model);
     LLAMA_API int32_t llama_model_n_layer    (const struct llama_model * model);
     LLAMA_API int32_t llama_model_n_head     (const struct llama_model * model);
     LLAMA_API int32_t llama_model_n_head_kv  (const struct llama_model * model);

src/llama-context.cpp

Lines changed: 3 additions & 3 deletions
@@ -827,7 +827,7 @@ int llama_context::encode(const llama_batch & batch_inp) {

     const auto & hparams = model.hparams;

-    const int64_t n_embd = hparams.n_embd;
+    const int64_t n_embd = hparams.n_embd_inp();
     const int64_t n_vocab = model.vocab.n_tokens();

     // note: during encode, we always pass the full sequence starting from pos = 0

@@ -996,7 +996,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
     const auto & hparams = model.hparams;

     const int64_t n_vocab = vocab.n_tokens();
-    const int64_t n_embd = hparams.n_embd;
+    const int64_t n_embd = hparams.n_embd_inp();

     // when computing embeddings, all tokens are output
     const bool output_all = cparams.embeddings;

@@ -2154,7 +2154,7 @@ void llama_context::opt_epoch_iter(
         batch.logits [pos_batch] = true;
     }

-    if (!balloc->init(batch, model.vocab, nullptr, model.hparams.n_embd, cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, true)) {
+    if (!balloc->init(batch, model.vocab, nullptr, model.hparams.n_embd_inp(), cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, true)) {
         LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
         return;
     }

src/llama-graph.cpp

Lines changed: 2 additions & 2 deletions
@@ -1142,7 +1142,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn(

 // input embeddings with optional lora
 ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
-    const int64_t n_embd = hparams.n_embd;
+    const int64_t n_embd = hparams.n_embd_inp();

     auto inp = std::make_unique<llm_graph_input_embd>();

@@ -1279,7 +1279,7 @@ ggml_tensor * llm_graph_context::build_inp_cross_embd() const {
     // return cur;
     //}

-    const auto n_embd = !cross->v_embd.empty() ? cross->n_embd : hparams.n_embd;
+    const auto n_embd = !cross->v_embd.empty() ? cross->n_embd : hparams.n_embd_inp();
     const auto n_enc = !cross->v_embd.empty() ? cross->n_enc : hparams.n_ctx_train;

     cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_enc);

src/llama-hparams.cpp

Lines changed: 10 additions & 0 deletions
@@ -60,6 +60,16 @@ uint32_t llama_hparams::n_gqa(uint32_t il) const {
     return n_head/n_head_kv;
 }

+uint32_t llama_hparams::n_embd_inp() const {
+    uint32_t n_embd_inp = n_embd;
+
+    if (n_deepstack_layers > 0) {
+        n_embd_inp += n_embd * n_deepstack_layers;
+    }
+
+    return n_embd_inp;
+}
+
 uint32_t llama_hparams::n_embd_k_gqa(uint32_t il) const {
     const uint32_t n_head_kv = this->n_head_kv(il);

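For illustration (hypothetical numbers, not from this commit): a text model with n_embd = 2048 and n_deepstack_layers = 3 reports n_embd_inp() = 2048 + 3 * 2048 = 8192, while n_embd itself stays 2048; for models without deepstack layers, n_embd_inp() == n_embd, so existing behavior is unchanged.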
src/llama-hparams.h

Lines changed: 3 additions & 0 deletions
@@ -227,6 +227,9 @@ struct llama_hparams {

     uint32_t n_gqa(uint32_t il = 0) const;

+    // dimension of main + auxiliary input embeddings
+    uint32_t n_embd_inp() const;
+
     // dimension of key embeddings across all k-v heads
     uint32_t n_embd_k_gqa(uint32_t il = 0) const;

src/llama-model.cpp

Lines changed: 7 additions & 16 deletions
@@ -276,8 +276,8 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w
             } break;
         case GGML_OP_IM2COL:
             {
-                const int n_embd = hparams.n_embd;
-                ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd, w->ne[1], 1, 1);
+                const int n_embd_inp = hparams.n_embd_inp();
+                ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd_inp, w->ne[1], 1, 1);
                 op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16);
             } break;
         case GGML_OP_SCALE:

@@ -1039,9 +1039,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     case 64: type = LLM_TYPE_32B; break;
                     default: type = LLM_TYPE_UNKNOWN;
                 }
-                // since vision model stacks deepstack features along feature dim
-                // we also create a fake "n_embd" for text model to be the main embd + deepstack embds
-                hparams.n_embd *= hparams.n_deepstack_layers + 1;
             } break;
         case LLM_ARCH_QWEN3MOE:
             {

@@ -1065,9 +1062,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     case 94: type = LLM_TYPE_235B_A22B; break;
                     default: type = LLM_TYPE_UNKNOWN;
                 }
-                // since vision model stacks deepstack features along feature dim
-                // we also create a fake "n_embd" for text model to be the main embd + deepstack embds
-                hparams.n_embd *= hparams.n_deepstack_layers + 1;
             } break;
         case LLM_ARCH_PHI2:
             {

@@ -3341,10 +3335,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         case LLM_ARCH_QWEN3:
         case LLM_ARCH_QWEN3VL:
             {
-                // for model loading, the weights only have the main embd
-                // so we need to divide by the number of deepstack layers + 1
-                // n_embd is const int so we declare a new variable
-                int64_t n_embd = hparams.n_embd / (hparams.n_deepstack_layers + 1);
                 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                 // output

@@ -3380,10 +3370,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         case LLM_ARCH_QWEN3MOE:
         case LLM_ARCH_QWEN3VLMOE:
             {
-                // for model loading, the weights only have the main embd
-                // so we need to divide by the number of deepstack layers + 1
-                // n_embd is const int so we declare a new variable
-                int64_t n_embd = hparams.n_embd / (hparams.n_deepstack_layers + 1);
                 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                 // output

@@ -6535,6 +6521,7 @@ void llama_model::print_info() const {
     if (!hparams.vocab_only) {
         LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
         LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd);
+        LLAMA_LOG_INFO("%s: n_embd_inp = %u\n", __func__, hparams.n_embd_inp());
         LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
         LLAMA_LOG_INFO("%s: n_head = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head(il); }, hparams.n_layer).c_str());
         LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str());

@@ -7380,6 +7367,10 @@ int32_t llama_model_n_embd(const llama_model * model) {
     return model->hparams.n_embd;
 }

+int32_t llama_model_n_embd_inp(const llama_model * model) {
+    return model->hparams.n_embd_inp();
+}
+
 int32_t llama_model_n_layer(const llama_model * model) {
     return model->hparams.n_layer;
 }

src/models/qwen3vl-moe.cpp

Lines changed: 1 addition & 2 deletions
@@ -1,9 +1,8 @@
 #include "models.h"

 llm_build_qwen3vlmoe::llm_build_qwen3vlmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-    const int64_t n_embd_full = hparams.n_embd; // main embd + deepstack embds
     const size_t n_deepstack_layers = hparams.n_deepstack_layers;
-    const int64_t n_embd = n_embd_full / (n_deepstack_layers + 1);
+    const int64_t n_embd = hparams.n_embd;
     const int64_t n_embd_head = hparams.n_embd_head_v;

     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);

src/models/qwen3vl.cpp

Lines changed: 1 addition & 4 deletions
@@ -1,13 +1,10 @@
 #include "models.h"

 llm_build_qwen3vl::llm_build_qwen3vl(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-
-    const int64_t n_embd_full = hparams.n_embd; // main embd + deepstack embds
     const size_t n_deepstack_layers = hparams.n_deepstack_layers;
-    const int64_t n_embd = n_embd_full / (n_deepstack_layers + 1);
+    const int64_t n_embd = hparams.n_embd;
     const int64_t n_embd_head = hparams.n_embd_head_v;

-
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
     GGML_ASSERT(n_embd_head == hparams.n_rot);

tools/mtmd/mtmd.cpp

Lines changed: 1 addition & 1 deletion
@@ -163,7 +163,7 @@ struct mtmd_context {
         print_timings(ctx_params.print_timings),
         n_threads (ctx_params.n_threads),
         media_marker (ctx_params.media_marker),
-        n_embd_text (llama_model_n_embd(text_model))
+        n_embd_text (llama_model_n_embd_inp(text_model))
     {
         if (std::string(ctx_params.image_marker) != MTMD_DEFAULT_IMAGE_MARKER) {
             throw std::runtime_error("custom image_marker is not supported anymore, use media_marker instead");
