@@ -73,17 +73,22 @@ bool llama_kv_cache::init(
         const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
         const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();
 
-        LLAMA_LOG_DEBUG("%s: layer %d: n_embd_k_gqa = %d, n_embd_v_gqa = %d\n", __func__, i, n_embd_k_gqa, n_embd_v_gqa);
+        const char * dev_name = "CPU";
 
         ggml_backend_buffer_type_t buft;
         if (offload) {
             auto * dev = model.dev_layer(i);
             buft = ggml_backend_dev_buffer_type(dev);
+
+            dev_name = ggml_backend_dev_name(dev);
         } else {
             buft = ggml_backend_cpu_buffer_type();
         }
-        ggml_context * ctx = ctx_for_buft(buft);
 
+        LLAMA_LOG_DEBUG("%s: layer %3d: n_embd_k_gqa = %d, n_embd_v_gqa = %d, dev = %s\n", __func__,
+                i, n_embd_k_gqa, n_embd_v_gqa, dev_name);
+
+        ggml_context * ctx = ctx_for_buft(buft);
         if (!ctx) {
             LLAMA_LOG_ERROR("%s: failed to create ggml context for kv cache\n", __func__);
             return false;
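
This hunk moves the per-layer debug log below the buffer-type selection so it can also report which device backs each layer's KV tensors, defaulting to `"CPU"` when offload is disabled. A minimal standalone sketch of that select-then-log pattern, using a hypothetical `select_layer_device()` in place of `model.dev_layer(i)`, a `fake_dev` struct in place of the ggml device handle, and plain `printf` in place of `LLAMA_LOG_DEBUG`:

```cpp
#include <cstdio>

// stand-in for ggml_backend_dev_t / ggml_backend_dev_name()
struct fake_dev { const char * name; };

// hypothetical stand-in for model.dev_layer(i)
static fake_dev * select_layer_device(int il) {
    static fake_dev gpu = { "CUDA0" };
    (void) il;
    return &gpu;
}

int main() {
    const bool offload = true;          // mirrors the `offload` flag in init()

    for (int il = 0; il < 4; ++il) {
        const char * dev_name = "CPU";  // default when the layer stays on the host
        if (offload) {
            fake_dev * dev = select_layer_device(il);
            dev_name = dev->name;       // mirrors ggml_backend_dev_name(dev)
        }
        // %3d right-aligns the layer index, matching the new log format string
        printf("layer %3d: dev = %s\n", il, dev_name);
    }
    return 0;
}
```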
@@ -134,14 +139,13 @@ size_t llama_kv_cache::total_size() const {
     return size;
 }
 
-// TODO: better data structures to reduce the cost of this operation
-llama_pos llama_kv_cache::max_pos() const {
-    llama_pos max_pos = -1;
+llama_pos llama_kv_cache::pos_max() const {
+    llama_pos pos_max = -1;
     for (const auto & cell : cells) {
-        max_pos = std::max(max_pos, cell.pos);
+        pos_max = std::max(pos_max, cell.pos);
     }
 
-    return max_pos;
+    return pos_max;
 }
 
 void llama_kv_cache::clear() {
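
The rename from `max_pos` to `pos_max` matches the noun-first naming of neighboring methods such as `cell_max()`, and the stale TODO is dropped; the logic is unchanged and still a linear scan over all cells. A self-contained sketch of that scan, assuming a cell type with just a `pos` field (the real `llama_kv_cache` cell carries more state):

```cpp
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

using llama_pos = int32_t;

struct kv_cell { llama_pos pos = -1; };  // pos == -1 marks an empty cell

static llama_pos pos_max(const std::vector<kv_cell> & cells) {
    llama_pos res = -1;                  // stays -1 when every cell is empty
    for (const auto & cell : cells) {
        res = std::max(res, cell.pos);
    }
    return res;
}

int main() {
    std::vector<kv_cell> cells(8);
    assert(pos_max(cells) == -1);        // empty cache
    cells[3].pos = 42;
    assert(pos_max(cells) == 42);        // highest occupied position wins
    return 0;
}
```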
@@ -672,6 +676,26 @@ uint32_t llama_kv_cache::cell_max() const {
     return 0;
 }
 
+size_t llama_kv_cache::size_k_bytes() const {
+    size_t size_k_bytes = 0;
+
+    for (const auto & k : k_l) {
+        size_k_bytes += ggml_nbytes(k);
+    }
+
+    return size_k_bytes;
+}
+
+size_t llama_kv_cache::size_v_bytes() const {
+    size_t size_v_bytes = 0;
+
+    for (const auto & v : v_l) {
+        size_v_bytes += ggml_nbytes(v);
+    }
+
+    return size_v_bytes;
+}
+
 void llama_kv_cache_clear(llama_kv_cache * kv) {
     kv->clear();
 }
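
The two new accessors sum `ggml_nbytes()` over the per-layer K and V tensors (`k_l` / `v_l`), giving the total cache footprint of each component. A sketch of the same accumulation pattern with a stand-in tensor type, since pulling in ggml itself isn't practical here:

```cpp
#include <cstddef>
#include <cstdio>
#include <vector>

// stand-in for ggml_tensor + ggml_nbytes(): one entry per layer
struct fake_tensor { size_t nbytes; };

static size_t size_bytes(const std::vector<fake_tensor> & tensors) {
    size_t size = 0;
    for (const auto & t : tensors) {
        size += t.nbytes;  // accumulate the allocated size of each layer's tensor
    }
    return size;
}

int main() {
    // e.g. 32 layers with 1 MiB of K cache per layer (illustrative numbers)
    std::vector<fake_tensor> k_l(32, { 1024u * 1024 });
    printf("K cache: %zu MiB\n", size_bytes(k_l) / (1024 * 1024));
    return 0;
}
```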