@@ -276,8 +276,8 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w
             } break;
         case GGML_OP_IM2COL:
             {
-                const int n_embd = hparams.n_embd;
-                ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd, w->ne[1], 1, 1);
+                const int n_embd_inp = hparams.n_embd_inp();
+                ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd_inp, w->ne[1], 1, 1);
                 op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16);
             } break;
         case GGML_OP_SCALE:
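The `n_embd_inp()` accessor that this hunk switches to is not defined anywhere in this diff. Based on the deepstack scaling that the later hunks remove (`n_embd * (n_deepstack_layers + 1)`), a minimal sketch of what it presumably looks like follows; the member names are taken from the diff, but the exact definition site (llama-hparams.cpp) and body are assumptions:

```cpp
// Sketch only: presumed accessor, not shown in this diff.
// Returns the input embedding width: the main n_embd plus one extra copy per
// deepstack layer, since the Qwen3VL vision model stacks deepstack features
// along the feature dimension. n_embd itself stays the true model width.
uint32_t llama_hparams::n_embd_inp() const {
    uint32_t n_embd_inp = n_embd;

    if (n_deepstack_layers > 0) {
        n_embd_inp += n_embd * n_deepstack_layers;
    }

    return n_embd_inp;
}
```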
@@ -1039,9 +1039,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     case 64: type = LLM_TYPE_32B; break;
                     default: type = LLM_TYPE_UNKNOWN;
                 }
-                // since vision model stacks deepstack features along feature dim
-                // we also create a fake "n_embd" for text model to be the main embd + deepstack embds
-                hparams.n_embd *= hparams.n_deepstack_layers + 1;
             } break;
         case LLM_ARCH_QWEN3MOE:
             {
@@ -1065,9 +1062,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     case 94: type = LLM_TYPE_235B_A22B; break;
                     default: type = LLM_TYPE_UNKNOWN;
                 }
-                // since vision model stacks deepstack features along feature dim
-                // we also create a fake "n_embd" for text model to be the main embd + deepstack embds
-                hparams.n_embd *= hparams.n_deepstack_layers + 1;
             } break;
         case LLM_ARCH_PHI2:
             {
@@ -3341,10 +3335,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         case LLM_ARCH_QWEN3:
         case LLM_ARCH_QWEN3VL:
             {
-                // for model loading, the weights only have the main embd
-                // so we need to divide by the number of deepstack layers + 1
-                // n_embd is const int so we declare a new variable
-                int64_t n_embd = hparams.n_embd / (hparams.n_deepstack_layers + 1);
                 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                 // output
@@ -3380,10 +3370,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         case LLM_ARCH_QWEN3MOE:
         case LLM_ARCH_QWEN3VLMOE:
             {
-                // for model loading, the weights only have the main embd
-                // so we need to divide by the number of deepstack layers + 1
-                // n_embd is const int so we declare a new variable
-                int64_t n_embd = hparams.n_embd / (hparams.n_deepstack_layers + 1);
                 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                 // output
@@ -6535,6 +6521,7 @@ void llama_model::print_info() const {
     if (!hparams.vocab_only) {
         LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
         LLAMA_LOG_INFO("%s: n_embd      = %u\n", __func__, hparams.n_embd);
+        LLAMA_LOG_INFO("%s: n_embd_inp  = %u\n", __func__, hparams.n_embd_inp());
         LLAMA_LOG_INFO("%s: n_layer     = %u\n", __func__, hparams.n_layer);
         LLAMA_LOG_INFO("%s: n_head      = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head(il); }, hparams.n_layer).c_str());
         LLAMA_LOG_INFO("%s: n_head_kv   = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str());
@@ -7380,6 +7367,10 @@ int32_t llama_model_n_embd(const llama_model * model) {
     return model->hparams.n_embd;
 }

+int32_t llama_model_n_embd_inp(const llama_model * model) {
+    return model->hparams.n_embd_inp();
+}
+
 int32_t llama_model_n_layer(const llama_model * model) {
     return model->hparams.n_layer;
 }
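The new `llama_model_n_embd_inp()` getter mirrors the existing `llama_model_n_embd()`. A usage sketch, assuming the declaration is exported from llama.h alongside the other getters (the model path below is a placeholder):

```cpp
// Sketch: size an input-embedding buffer from the input embedding width rather
// than n_embd, so deepstack-expanded models (e.g. Qwen3VL) get the full row size.
#include "llama.h"

#include <cstdio>
#include <vector>

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_model_load_from_file("model.gguf" /* placeholder path */, mparams);
    if (!model) {
        llama_backend_free();
        return 1;
    }

    const int32_t n_embd     = llama_model_n_embd(model);
    const int32_t n_embd_inp = llama_model_n_embd_inp(model); // new getter from this change

    printf("n_embd = %d, n_embd_inp = %d\n", (int) n_embd, (int) n_embd_inp);

    // one input-embedding row per token
    std::vector<float> inp_embd((size_t) n_embd_inp);

    llama_model_free(model);
    llama_backend_free();
    return 0;
}
```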