@@ -2,7 +2,6 @@
 
 #include "llama-chat.h"
 #include "llama-mmap.h"
-#include "llama-context.h"
 #include "llama-vocab.h"
 #include "llama-model-loader.h"
 #include "llama-model.h"
@@ -25,41 +24,6 @@
 // interface implementation
 //
 
-struct llama_context_params llama_context_default_params() {
-    struct llama_context_params result = {
-        /*.n_ctx               =*/ 512,
-        /*.n_batch             =*/ 2048,
-        /*.n_ubatch            =*/ 512,
-        /*.n_seq_max           =*/ 1,
-        /*.n_threads           =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
-        /*.n_threads_batch     =*/ GGML_DEFAULT_N_THREADS,
-        /*.rope_scaling_type   =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
-        /*.pooling_type        =*/ LLAMA_POOLING_TYPE_UNSPECIFIED,
-        /*.attention_type      =*/ LLAMA_ATTENTION_TYPE_UNSPECIFIED,
-        /*.rope_freq_base      =*/ 0.0f,
-        /*.rope_freq_scale     =*/ 0.0f,
-        /*.yarn_ext_factor     =*/ -1.0f,
-        /*.yarn_attn_factor    =*/ 1.0f,
-        /*.yarn_beta_fast      =*/ 32.0f,
-        /*.yarn_beta_slow      =*/ 1.0f,
-        /*.yarn_orig_ctx       =*/ 0,
-        /*.defrag_thold        =*/ -1.0f,
-        /*.cb_eval             =*/ nullptr,
-        /*.cb_eval_user_data   =*/ nullptr,
-        /*.type_k              =*/ GGML_TYPE_F16,
-        /*.type_v              =*/ GGML_TYPE_F16,
-        /*.logits_all          =*/ false,
-        /*.embeddings          =*/ false,
-        /*.offload_kqv         =*/ true,
-        /*.flash_attn          =*/ false,
-        /*.no_perf             =*/ true,
-        /*.abort_callback      =*/ nullptr,
-        /*.abort_callback_data =*/ nullptr,
-    };
-
-    return result;
-}
-
 struct llama_sampler_chain_params llama_sampler_chain_default_params() {
     struct llama_sampler_chain_params result = {
         /*.no_perf =*/ true,
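
Side note on the block removed above: llama_context_default_params() is how callers obtain a llama_context_params value before overriding individual fields, so removing it from this file does not change the public contract as long as it is provided elsewhere. A minimal usage sketch (the field values are illustrative only, not recommendations):

    // Sketch: take the defaults shown in the removed block, then override a few fields.
    #include "llama.h"

    static llama_context_params make_ctx_params() {
        llama_context_params cparams = llama_context_default_params();

        cparams.n_ctx      = 4096;             // default is 512
        cparams.n_batch    = 1024;             // logical batch size (default 2048)
        cparams.type_v     = GGML_TYPE_Q8_0;   // quantized V cache ...
        cparams.flash_attn = true;             // ... which requires flash_attn (see the
                                               //     check in llama_init_from_model below)
        return cparams;
    }
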
@@ -289,80 +253,6 @@ struct llama_model * llama_model_load_from_splits( |
     return llama_model_load_from_file_impl(splits.front(), splits, params);
 }
 
-struct llama_context * llama_init_from_model(
-        struct llama_model * model,
-        struct llama_context_params params) {
-
-    if (!model) {
-        LLAMA_LOG_ERROR("%s: model cannot be NULL\n", __func__);
-        return nullptr;
-    }
-
-    if (params.n_batch == 0 && params.n_ubatch == 0) {
-        LLAMA_LOG_ERROR("%s: n_batch and n_ubatch cannot both be zero\n", __func__);
-        return nullptr;
-    }
-
-    if (params.n_ctx == 0 && model->hparams.n_ctx_train == 0) {
-        LLAMA_LOG_ERROR("%s: n_ctx and model->hparams.n_ctx_train cannot both be zero\n", __func__);
-        return nullptr;
-    }
-
-    if (params.flash_attn && model->arch == LLM_ARCH_GROK) {
-        LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__);
-        params.flash_attn = false;
-    }
-
-    if (params.flash_attn && model->hparams.n_embd_head_k != model->hparams.n_embd_head_v) {
-        LLAMA_LOG_WARN("%s: flash_attn requires n_embd_head_k == n_embd_head_v - forcing off\n", __func__);
-        params.flash_attn = false;
-    }
-
-    if (ggml_is_quantized(params.type_v) && !params.flash_attn) {
-        LLAMA_LOG_ERROR("%s: V cache quantization requires flash_attn\n", __func__);
-        return nullptr;
-    }
-
-    llama_context * ctx = nullptr;
-
-    try {
-        // TODO: make static method of llama_context
-        switch (model->arch) {
-            case LLM_ARCH_BERT:
-            case LLM_ARCH_JINA_BERT_V2:
-            case LLM_ARCH_NOMIC_BERT:
-                ctx = new llama_context_enc(*model, params, LLM_GRAPH_TYPE_DEFAULT);
-                break;
-            case LLM_ARCH_T5:
-                ctx = new llama_context_enc_dec(*model, params);
-                break;
-            case LLM_ARCH_RWKV6:
-            case LLM_ARCH_RWKV6QWEN2:
-            case LLM_ARCH_MAMBA:
-                GGML_ASSERT(llama_model_is_recurrent(model));
-                ctx = new llama_context_recurrent(*model, params, LLM_GRAPH_TYPE_DEFAULT);
-                break;
-            default:
-                GGML_ASSERT(!llama_model_is_recurrent(model));
-                ctx = new llama_context_kv_self(*model, params, LLM_GRAPH_TYPE_DEFAULT);
-        };
-
-        ctx->init();
-    } catch (const std::exception & e) {
-        LLAMA_LOG_ERROR("%s: failed to initialize context: %s\n", __func__, e.what());
-        return nullptr;
-    }
-
-    return ctx;
-}
-
-// deprecated
-struct llama_context * llama_new_context_with_model(
-        struct llama_model * model,
-        struct llama_context_params params) {
-    return llama_init_from_model(model, params);
-}
-
 //
 // chat templates
 //
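
For context on the hunk above: llama_init_from_model() validates the parameters (batch sizes, context length, the flash_attn constraints), then dispatches on model->arch to construct the appropriate context type (encoder-only, encoder-decoder, recurrent, or the default KV-self-attention context), and llama_new_context_with_model() survives only as a deprecated wrapper. A hedged sketch of the caller-side sequence, assuming the public llama.h entry points of this revision ("model.gguf" is a placeholder path):

    // Sketch: typical client sequence that ends up in the code removed above.
    #include "llama.h"
    #include <cstdio>

    int main() {
        llama_backend_init();

        llama_model_params   mparams = llama_model_default_params();
        llama_context_params cparams = llama_context_default_params();
        cparams.n_ctx = 4096;

        llama_model * model = llama_model_load_from_file("model.gguf", mparams);
        if (model == nullptr) {
            fprintf(stderr, "failed to load model\n");
            return 1;
        }

        // Validates cparams and builds the arch-specific context (see the removed switch).
        llama_context * ctx = llama_init_from_model(model, cparams);
        if (ctx == nullptr) {
            fprintf(stderr, "failed to create context\n");
            llama_model_free(model);
            return 1;
        }

        // ... tokenize, llama_decode(), sampling ...

        llama_free(ctx);
        llama_model_free(model);
        llama_backend_free();
        return 0;
    }
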
@@ -448,36 +338,3 @@ const char * llama_print_system_info(void) { |
 
     return s.c_str();
 }
-
-//
-// perf
-//
-
-struct llama_perf_context_data llama_perf_context(const struct llama_context * ctx) {
-    struct llama_perf_context_data data = {};
-
-    if (ctx == nullptr) {
-        return data;
-    }
-
-    data = ctx->perf_get_data();
-
-    return data;
-}
-
-void llama_perf_context_print(const struct llama_context * ctx) {
-    const auto data = llama_perf_context(ctx);
-
-    const double t_end_ms = 1e-3 * ggml_time_us();
-
-    LLAMA_LOG_INFO("%s:        load time = %10.2f ms\n", __func__, data.t_load_ms);
-    LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval);
-    LLAMA_LOG_INFO("%s:        eval time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval);
-    LLAMA_LOG_INFO("%s:       total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));
-}
-
-void llama_perf_context_reset(struct llama_context * ctx) {
-    ctx->perf_reset();
-}
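
One note on the removed perf helpers: ggml_time_us() returns microseconds, so the 1e-3 factor converts the wall-clock end time to milliseconds, and throughput is reported as 1e3 / t_ms * n_tokens. A small self-contained sketch of that arithmetic with hypothetical numbers:

    // Sketch: the ms-per-token / tokens-per-second arithmetic from the removed print function.
    #include <cstdio>

    int main() {
        const double t_eval_ms = 2500.0; // hypothetical decode time for the run
        const int    n_eval    = 100;    // hypothetical number of decoded tokens

        const double ms_per_token = t_eval_ms / n_eval;        // 25.00 ms per token
        const double tokens_per_s = 1e3 / t_eval_ms * n_eval;  // 40.00 tokens per second

        printf("eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
               t_eval_ms, n_eval, ms_per_token, tokens_per_s);
        return 0;
    }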