@@ -2092,6 +2092,10 @@ struct llama_context {
     struct ggml_tensor * inp_s_mask; // F32 [kv_size]
     struct ggml_tensor * inp_s_seq; // I32 [kv_size, n_batch]
 
+    struct llama_control_vector * control_vector = nullptr; // not owned; lifetime is managed by the caller
+    int32_t control_vector_layer_start = -1; // inclusive range of layers the vector applies to
+    int32_t control_vector_layer_end   = -1;
+
 #ifdef GGML_USE_MPI
     ggml_mpi_context * ctx_mpi = NULL;
 #endif
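Note that the context only borrows the control vector here: nothing in llama_context frees it, so the caller keeps ownership and must pair llama_apply_control_vector() with llama_control_vector_free(), both added further down. A hedged sketch of a per-request toggle built on that contract; the helper name and flow are hypothetical, not part of the patch:

// Hypothetical helper, assuming the entry points added later in this patch.
void set_steering(struct llama_context * lctx, struct llama_control_vector * cv,
                  int32_t layer_start, int32_t layer_end, bool enable) {
    if (enable && cv != nullptr) {
        llama_apply_control_vector(lctx, cv, layer_start, layer_end); // borrow; no ownership transfer
    } else {
        llama_clear_control_vector(lctx); // detach; cv stays valid and reusable
    }
    // the caller still owns cv and must call llama_control_vector_free() eventually
}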
@@ -5416,6 +5420,8 @@ static struct ggml_tensor * llm_build_kv(
     return cur;
 }
 
+static ggml_tensor * get_control_vector_layer_tensor(struct llama_control_vector * vector, int il);
+
 struct llm_build_context {
     const llama_model & model;
     const llama_context & lctx;
@@ -5770,6 +5776,15 @@ struct llm_build_context {
         }
 
         cur = ggml_add(ctx0, cur, ffn_inp);
+        cb(cur, "ffn_out", il);
+
+        // optionally add this layer's control vector direction to the residual stream
+        if (lctx.control_vector != nullptr && il >= lctx.control_vector_layer_start && il <= lctx.control_vector_layer_end) {
+            ggml_tensor * layer_dir = get_control_vector_layer_tensor(lctx.control_vector, il);
+            if (layer_dir != nullptr) {
+                cur = ggml_add(ctx0, cur, layer_dir);
+            }
+        }
         cb(cur, "l_out", il);
 
         // input for next layer
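Worth noting: the direction tensors are one-dimensional F32 [n_embd] (the loader below enforces this), while cur is [n_embd, n_tokens], so ggml_add broadcasts the same vector across all tokens. A standalone illustration of that behavior, outside the patch; the sizes and values are made up:

// Sketch: ggml_add broadcasts a 1-D [n_embd] direction over a
// [n_embd, n_tokens] activation, shifting every token's hidden state.
#include "ggml.h"
#include <stdio.h>

int main(void) {
    struct ggml_init_params params = {
        /* .mem_size   = */ 16 * 1024 * 1024,
        /* .mem_buffer = */ nullptr,
        /* .no_alloc   = */ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    const int n_embd = 4, n_tokens = 3;
    struct ggml_tensor * cur       = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens);
    struct ggml_tensor * layer_dir = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
    ggml_set_f32(cur, 1.0f);        // stand-in hidden states, all ones
    ggml_set_f32(layer_dir, 0.5f);  // stand-in steering direction

    struct ggml_tensor * out = ggml_add(ctx, cur, layer_dir); // broadcasts over tokens

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, out);
    ggml_graph_compute_with_ctx(ctx, gf, /* n_threads = */ 1);

    printf("out[0] = %f\n", ggml_get_f32_1d(out, 0)); // prints 1.5
    ggml_free(ctx);
    return 0;
}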
@@ -13183,6 +13197,229 @@ int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const
     }
 }
 
+struct llama_control_vector {
+    struct ggml_context * ctx;
+    std::vector<ggml_tensor *> tensors; // per-layer direction tensors, indexed by layer; index 0 is never used
+
+    llama_control_vector() : ctx(nullptr) {}
+
+    ~llama_control_vector() {
+        if (this->ctx) {
+            ggml_free(this->ctx);
+        }
+    }
+};
+
+static ggml_tensor * get_control_vector_layer_tensor(struct llama_control_vector * vector, int il) {
+    if (!vector->ctx || il < 0 || (size_t) il >= vector->tensors.size()) {
+        return nullptr;
+    }
+    return vector->tensors[il];
+}
+
+struct llama_control_vector * llama_control_vector_load(const char * path) {
+    int n_tensors = 0;
+    size_t n_bytes = 0;
+    uint32_t max_direction_layer = 0;
+
+    // calculate the size of ctx needed for the tensors, ensure the tensors are f32, and find the max layer
+    {
+        struct ggml_init_params meta_params = {
+            /* .mem_size   = */ ggml_tensor_overhead() * 128 + ggml_graph_overhead(), // metadata only; assumes at most 128 tensors
+            /* .mem_buffer = */ nullptr,
+            /* .no_alloc   = */ true,
+        };
+        ggml_context * meta_ctx = ggml_init(meta_params);
+        struct gguf_init_params meta_gguf_params = {
+            /* .no_alloc = */ true,
+            /* .ctx      = */ &meta_ctx,
+        };
+        struct gguf_context * meta_ctx_gguf = gguf_init_from_file(path, meta_gguf_params);
+        if (!meta_ctx_gguf) {
+            LLAMA_LOG_ERROR("%s: failed to load control vector from %s\n", __func__, path);
+            ggml_free(meta_ctx);
+            return nullptr;
+        }
+
+        n_tensors = gguf_get_n_tensors(meta_ctx_gguf);
+        for (int i = 0; i < n_tensors; i++) {
+            std::string name = gguf_get_tensor_name(meta_ctx_gguf, i);
+
+            // split on '.' to parse names of the form "direction.<layer>"
+            size_t dotpos = name.find('.');
+            if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") {
+                try {
+                    uint32_t layer = std::stoi(name.substr(dotpos + 1));
+                    if (layer == 0) { // layer indices are 1-based
+                        LLAMA_LOG_ERROR("%s: direction tensor invalid: %s\n", __func__, name.c_str());
+                        ggml_free(meta_ctx);
+                        gguf_free(meta_ctx_gguf);
+                        return nullptr;
+                    }
+                    if (layer > max_direction_layer) {
+                        max_direction_layer = layer;
+                    }
+                } catch (...) {
+                    LLAMA_LOG_ERROR("%s: direction tensor invalid: %s\n", __func__, name.c_str());
+                    ggml_free(meta_ctx);
+                    gguf_free(meta_ctx_gguf);
+                    return nullptr;
+                }
+            }
+
+            struct ggml_tensor * tensor_meta = ggml_get_tensor(meta_ctx, name.c_str());
+            if (tensor_meta->type != GGML_TYPE_F32 || ggml_n_dims(tensor_meta) != 1) {
+                LLAMA_LOG_ERROR("%s: direction tensor invalid: %s\n", __func__, name.c_str());
+                ggml_free(meta_ctx);
+                gguf_free(meta_ctx_gguf);
+                return nullptr;
+            }
+            n_bytes += ggml_nbytes(tensor_meta);
+        }
+        ggml_free(meta_ctx);
+        gguf_free(meta_ctx_gguf);
+    }
+
+    // load the tensor data into a new ggml context sized for all tensors plus overhead
+    struct ggml_init_params ggml_params = {
+        /* .mem_size   = */ ggml_tensor_overhead() * n_tensors + n_bytes,
+        /* .mem_buffer = */ nullptr,
+        /* .no_alloc   = */ false,
+    };
+    struct ggml_context * ctx = ggml_init(ggml_params);
+
+    struct gguf_init_params params = {
+        /* .no_alloc = */ false,
+        /* .ctx      = */ &ctx,
+    };
+    struct gguf_context * ctx_gguf = gguf_init_from_file(path, params);
+    if (!ctx_gguf) {
+        LLAMA_LOG_ERROR("%s: failed to load control vector from %s\n", __func__, path);
+        ggml_free(ctx);
+        return nullptr;
+    }
+
+    struct llama_control_vector * vector = new llama_control_vector();
+    vector->ctx = ctx;
+    vector->tensors.push_back(nullptr); // there's never a direction vector for layer 0
+    for (uint32_t i = 1; i <= max_direction_layer; i++) {
+        std::string name = format("direction.%u", i);
+        ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str());
+        if (tensor) {
+            vector->tensors.push_back(tensor);
+            // LLAMA_LOG_INFO("%s: found control vector tensor: t[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, ggml_n_dims(tensor), tensor->name, tensor->data);
+        } else {
+            vector->tensors.push_back(nullptr); // filler for layers without a direction
+        }
+    }
+
+    gguf_free(ctx_gguf); // the tensor data lives in ctx; the gguf metadata is no longer needed
+    return vector;
+}
+
+struct llama_control_vector * llama_control_vector_dup(const struct llama_control_vector * vector) {
+    struct llama_control_vector * new_vector = new llama_control_vector();
+    if (vector->ctx == nullptr) {
+        return new_vector;
+    }
+    struct ggml_init_params ggml_params = {
+        /* .mem_size   = */ ggml_get_mem_size(vector->ctx),
+        /* .mem_buffer = */ nullptr,
+        /* .no_alloc   = */ false,
+    };
+
+    struct ggml_context * ctx = ggml_init(ggml_params);
+
+    for (ggml_tensor * tensor : vector->tensors) {
+        if (tensor == nullptr) {
+            new_vector->tensors.push_back(nullptr);
+        } else {
+            ggml_tensor * new_tensor = ggml_dup_tensor(ctx, tensor);
+            memcpy(new_tensor->data, tensor->data, ggml_nbytes(tensor)); // ggml_dup_tensor copies shape and type, not data
+            new_vector->tensors.push_back(new_tensor);
+        }
+    }
+
+    new_vector->ctx = ctx;
+    return new_vector;
+}
+
+int32_t llama_control_vector_scale(struct llama_control_vector * vector, float strength) {
+    if (vector->ctx == nullptr) {
+        LLAMA_LOG_ERROR("%s: attempted to scale an unloaded control vector\n", __func__);
+        return 1;
+    }
+
+    for (ggml_tensor * tensor : vector->tensors) {
+        if (tensor == nullptr) continue;
+        for (int j = 0; j < ggml_nelements(tensor); j++) {
+            float v = ggml_get_f32_1d(tensor, j);
+            ggml_set_f32_1d(tensor, j, v * strength);
+        }
+    }
+
+    return 0;
+}
+
+int32_t llama_control_vector_add(struct llama_control_vector * vector, const struct llama_control_vector * other) {
+    if (vector->ctx == nullptr || other->ctx == nullptr) {
+        LLAMA_LOG_ERROR("%s: attempted to add with an unloaded control vector\n", __func__);
+        return 1;
+    }
+
+    size_t size = std::max(vector->tensors.size(), other->tensors.size());
+    for (size_t i = 0; i < size; i++) {
+        if (i >= vector->tensors.size()) {
+            vector->tensors.push_back(nullptr);
+        }
+
+        ggml_tensor * other_tensor = i < other->tensors.size() ? other->tensors[i] : nullptr;
+        if (other_tensor != nullptr) {
+            if (vector->tensors[i] == nullptr) {
+                ggml_tensor * new_tensor = ggml_dup_tensor(vector->ctx, other_tensor);
+                memcpy(new_tensor->data, other_tensor->data, ggml_nbytes(other_tensor)); // ggml_dup_tensor copies shape and type, not data
+                vector->tensors[i] = new_tensor;
+            } else {
+                ggml_tensor * this_tensor = vector->tensors[i];
+                size_t this_nelements = ggml_nelements(this_tensor);
+                size_t other_nelements = ggml_nelements(other_tensor);
+
+                if (this_nelements != other_nelements) {
+                    LLAMA_LOG_ERROR("%s: attempted to add control vectors of incompatible dimension: %zu != %zu\n", __func__, this_nelements, other_nelements);
+                    return 1;
+                }
+
+                for (size_t j = 0; j < this_nelements; j++) {
+                    float a = ggml_get_f32_1d(this_tensor, j);
+                    float b = ggml_get_f32_1d(other_tensor, j);
+                    ggml_set_f32_1d(this_tensor, j, a + b);
+                }
+            }
+        }
+    }
+
+    return 0;
+}
+
+void llama_control_vector_free(struct llama_control_vector * vector) {
+    delete vector;
+}
+
+void llama_apply_control_vector(
+        struct llama_context * lctx,
+        struct llama_control_vector * vector,
+        int32_t control_vector_layer_start,
+        int32_t control_vector_layer_end
+) {
+    lctx->control_vector = vector;
+    lctx->control_vector_layer_start = control_vector_layer_start;
+    lctx->control_vector_layer_end   = control_vector_layer_end;
+}
+
+void llama_clear_control_vector(struct llama_context * lctx) {
+    lctx->control_vector = nullptr;
+}
+
 struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq) {
     struct llama_kv_cache_view result = {
         /*.n_cells = */ 0,
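The patch itself ships no driver for these entry points. A minimal usage sketch, under the assumption that they are exposed in llama.h; the file names, strengths, and helper name below are made up:

// Hypothetical end-to-end flow: load, blend, scale, attach, detach, free.
#include "llama.h"

int steer(struct llama_context * lctx, int n_layer) {
    struct llama_control_vector * cv = llama_control_vector_load("happy.gguf"); // hypothetical file
    if (cv == nullptr) {
        return 1;
    }

    struct llama_control_vector * cv2 = llama_control_vector_load("calm.gguf"); // hypothetical file
    if (cv2 != nullptr) {
        llama_control_vector_scale(cv2, 0.5f);  // halve the second direction
        llama_control_vector_add(cv, cv2);      // blend it into the first
        llama_control_vector_free(cv2);
    }

    llama_control_vector_scale(cv, 0.8f); // overall strength; negative values steer away

    // apply to every transformer layer; direction indices are 1-based per the loader
    llama_apply_control_vector(lctx, cv, 1, n_layer);

    // ... run llama_decode(...) as usual ...

    llama_clear_control_vector(lctx); // detach before freeing
    llama_control_vector_free(cv);
    return 0;
}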