@@ -4,15 +4,16 @@
 #include "llama-batch.h"
 #include "llama-cparams.h"
 #include "llama-graph.h"
-#include "llama-model.h"
-#include "llama-kv-cache.h"
 #include "llama-adapter.h"
 
 #include "ggml-cpp.h"
 
 #include <map>
 #include <vector>
 
+struct llama_model;
+struct llama_kv_cache;
+
 class llama_io_read_i;
 class llama_io_write_i;
 
@@ -244,28 +245,29 @@ class llama_context_base : public llama_context {
 
     // Make sure enough space is available for outputs.
     // Returns max number of outputs for which space was reserved.
-    virtual int32_t output_reserve(int32_t n_outputs);
+    int32_t output_reserve(int32_t n_outputs);
 
     // make the outputs have the same order they had in the user-provided batch
     // TODO: maybe remove this
-    virtual void output_reorder();
+    void output_reorder();
 
     //
     // graph
     //
 
-    virtual int32_t graph_max_nodes() const;
+    int32_t graph_max_nodes() const;
 
     // zero-out inputs and create the ctx_compute for the compute graph
-    virtual ggml_cgraph * graph_init();
+    ggml_cgraph * graph_init();
 
+    // override this method in order to pass custom set of parameters to the llm_graph_context
     virtual llm_graph_result_ptr graph_build(
             ggml_context * ctx,
             ggml_cgraph * gf,
             const llama_ubatch & ubatch);
 
     // returns the result of ggml_backend_sched_graph_compute_async execution
-    virtual enum ggml_status graph_compute(
+    enum ggml_status graph_compute(
             ggml_cgraph * gf,
             bool batched);
 
@@ -330,6 +332,8 @@ class llama_context_base : public llama_context {
             size_t n_token_count) override;
 
 protected:
+    // override these to store all relevant state for the specific context
+    // TODO: read/write adapters
     virtual size_t state_write_data(llama_io_write_i & io);
     virtual size_t state_read_data (llama_io_read_i & io);
 
@@ -345,10 +349,10 @@ class llama_context_base : public llama_context {
 
     const llm_graph_type gtype;
 
-    llama_cparams      cparams;
-    llama_adapter_cvec cvec;
-    llama_loras        loras;
-    llama_sbatch       sbatch;
+    llama_cparams       cparams;
+    llama_adapter_cvec  cvec;
+    llama_adapter_loras loras;
+    llama_sbatch        sbatch;
 
     ggml_backend_sched_ptr sched;
 
@@ -431,8 +435,6 @@ class llama_context_kv_self : public llama_context_base {
     // graph
     //
 
-    ggml_cgraph * graph_init() override;
-
     llm_graph_result_ptr graph_build(
             ggml_context * ctx,
             ggml_cgraph * gf,
@@ -482,8 +484,6 @@ class llama_context_recurrent : public llama_context_base {
     // graph
     //
 
-    ggml_cgraph * graph_init() override;
-
     llm_graph_result_ptr graph_build(
             ggml_context * ctx,
             ggml_cgraph * gf,
@@ -532,8 +532,6 @@ class llama_context_dec : public llama_context_kv_self {
     // graph
     //
 
-    ggml_cgraph * graph_init() override;
-
     llm_graph_result_ptr graph_build(
             ggml_context * ctx,
             ggml_cgraph * gf,
@@ -677,7 +675,3 @@ class llama_context_enc_dec : public llama_context {
 
     llama_cross cross;
 };
-
-// For internal test use
-// TODO: remove
-const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(struct llama_context * ctx);
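The methods that stay `virtual` after this change mark the intended extension points: a derived context overrides `graph_build` to pass its own parameters to the `llm_graph_context`, and `state_write_data`/`state_read_data` to persist whatever extra state it carries. Below is a minimal sketch of such a subclass, assuming the declarations in this header and the `write`/`read_to` primitives of the IO interfaces; the class name `llama_context_custom` and its `n_extra` member are hypothetical, not part of this PR:

// hypothetical subclass illustrating the extension points left virtual above;
// llama_context_custom and n_extra are made up for illustration
class llama_context_custom : public llama_context_base {
protected:
    llm_graph_result_ptr graph_build(
            ggml_context * ctx,
            ggml_cgraph * gf,
            const llama_ubatch & ubatch) override {
        // a real implementation would construct the llm_graph_context with
        // custom parameters here; this sketch just defers to the base class
        return llama_context_base::graph_build(ctx, gf, ubatch);
    }

    size_t state_write_data(llama_io_write_i & io) override {
        size_t n_bytes = llama_context_base::state_write_data(io); // base state first
        io.write(&n_extra, sizeof(n_extra));                       // then the extra state
        return n_bytes + sizeof(n_extra);
    }

    size_t state_read_data(llama_io_read_i & io) override {
        size_t n_bytes = llama_context_base::state_read_data(io);
        io.read_to(&n_extra, sizeof(n_extra));
        return n_bytes + sizeof(n_extra);
    }

private:
    int32_t n_extra = 0; // hypothetical per-context state
};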