@@ -2092,6 +2092,10 @@ struct llama_context {
     struct ggml_tensor * inp_s_mask; // F32 [kv_size]
     struct ggml_tensor * inp_s_seq; // I32 [kv_size, n_batch]
 
+    struct llama_control_vector * control_vector = nullptr; // not owned; lifetime is managed by the caller
+    int32_t control_vector_layer_start = -1; // inclusive range of layers the vector applies to
+    int32_t control_vector_layer_end   = -1;
+
 #ifdef GGML_USE_MPI
     ggml_mpi_context * ctx_mpi = NULL;
 #endif
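Note that the context only borrows the control vector here: nothing in llama_context frees it, so the caller keeps ownership and must pair llama_apply_control_vector() with llama_control_vector_free(), both added further down. A hedged sketch of a per-request toggle built on that contract; the helper name and flow are hypothetical, not part of the patch:

// Hypothetical helper, assuming the entry points added later in this patch.
void set_steering(struct llama_context * lctx, struct llama_control_vector * cv,
                  int32_t layer_start, int32_t layer_end, bool enable) {
    if (enable && cv != nullptr) {
        llama_apply_control_vector(lctx, cv, layer_start, layer_end); // borrow; no ownership transfer
    } else {
        llama_clear_control_vector(lctx); // detach; cv stays valid and reusable
    }
    // the caller still owns cv and must call llama_control_vector_free() eventually
}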
@@ -5416,6 +5420,8 @@ static struct ggml_tensor * llm_build_kv(
     return cur;
 }
 
+static ggml_tensor * get_control_vector_layer_tensor(struct llama_control_vector * vector, int il);
+
 struct llm_build_context {
     const llama_model & model;
     const llama_context & lctx;
@@ -5770,6 +5776,15 @@ struct llm_build_context {
         }
 
         cur = ggml_add(ctx0, cur, ffn_inp);
+        cb(cur, "ffn_out", il);
+
+        // optionally add this layer's control vector direction to the residual stream
+        if (lctx.control_vector != nullptr && il >= lctx.control_vector_layer_start && il <= lctx.control_vector_layer_end) {
+            ggml_tensor * layer_dir = get_control_vector_layer_tensor(lctx.control_vector, il);
+            if (layer_dir != nullptr) {
+                cur = ggml_add(ctx0, cur, layer_dir);
+            }
+        }
         cb(cur, "l_out", il);
 
         // input for next layer
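Worth noting: the direction tensors are one-dimensional F32 [n_embd] (the loader below enforces this), while cur is [n_embd, n_tokens], so ggml_add broadcasts the same vector across all tokens. A standalone illustration of that behavior, outside the patch; the sizes and values are made up:

// Sketch: ggml_add broadcasts a 1-D [n_embd] direction over a
// [n_embd, n_tokens] activation, shifting every token's hidden state.
#include "ggml.h"
#include <stdio.h>

int main(void) {
    struct ggml_init_params params = {
        /* .mem_size   = */ 16 * 1024 * 1024,
        /* .mem_buffer = */ nullptr,
        /* .no_alloc   = */ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    const int n_embd = 4, n_tokens = 3;
    struct ggml_tensor * cur       = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens);
    struct ggml_tensor * layer_dir = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
    ggml_set_f32(cur, 1.0f);        // stand-in hidden states, all ones
    ggml_set_f32(layer_dir, 0.5f);  // stand-in steering direction

    struct ggml_tensor * out = ggml_add(ctx, cur, layer_dir); // broadcasts over tokens

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, out);
    ggml_graph_compute_with_ctx(ctx, gf, /* n_threads = */ 1);

    printf("out[0] = %f\n", ggml_get_f32_1d(out, 0)); // prints 1.5
    ggml_free(ctx);
    return 0;
}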
@@ -13183,6 +13197,229 @@ int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const
     }
 }
 
+struct llama_control_vector {
+    struct ggml_context * ctx;
+    std::vector<ggml_tensor *> tensors; // per-layer direction tensors, indexed by layer; index 0 is never used
+
+    llama_control_vector() : ctx(nullptr) {}
+
+    ~llama_control_vector() {
+        if (this->ctx) {
+            ggml_free(this->ctx);
+        }
+    }
+};
+
+static ggml_tensor * get_control_vector_layer_tensor(struct llama_control_vector * vector, int il) {
+    if (!vector->ctx || il < 0 || (size_t) il >= vector->tensors.size()) {
+        return nullptr;
+    }
+    return vector->tensors[il];
+}
+
+struct llama_control_vector * llama_control_vector_load(const char * path) {
+    int n_tensors = 0;
+    size_t n_bytes = 0;
+    uint32_t max_direction_layer = 0;
+
+    // calculate the size of ctx needed for the tensors, ensure the tensors are f32, and find the max layer
+    {
+        struct ggml_init_params meta_params = {
+            /* .mem_size   = */ ggml_tensor_overhead() * 128 + ggml_graph_overhead(), // metadata only; assumes at most 128 tensors
+            /* .mem_buffer = */ nullptr,
+            /* .no_alloc   = */ true,
+        };
+        ggml_context * meta_ctx = ggml_init(meta_params);
+        struct gguf_init_params meta_gguf_params = {
+            /* .no_alloc = */ true,
+            /* .ctx      = */ &meta_ctx,
+        };
+        struct gguf_context * meta_ctx_gguf = gguf_init_from_file(path, meta_gguf_params);
+        if (!meta_ctx_gguf) {
+            LLAMA_LOG_ERROR("%s: failed to load control vector from %s\n", __func__, path);
+            ggml_free(meta_ctx);
+            return nullptr;
+        }
+
+        n_tensors = gguf_get_n_tensors(meta_ctx_gguf);
+        for (int i = 0; i < n_tensors; i++) {
+            std::string name = gguf_get_tensor_name(meta_ctx_gguf, i);
+
+            // split on '.' to parse names of the form "direction.<layer>"
+            size_t dotpos = name.find('.');
+            if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") {
+                try {
+                    uint32_t layer = std::stoi(name.substr(dotpos + 1));
+                    if (layer == 0) { // layer indices are 1-based
+                        LLAMA_LOG_ERROR("%s: direction tensor invalid: %s\n", __func__, name.c_str());
+                        ggml_free(meta_ctx);
+                        gguf_free(meta_ctx_gguf);
+                        return nullptr;
+                    }
+                    if (layer > max_direction_layer) {
+                        max_direction_layer = layer;
+                    }
+                } catch (...) {
+                    LLAMA_LOG_ERROR("%s: direction tensor invalid: %s\n", __func__, name.c_str());
+                    ggml_free(meta_ctx);
+                    gguf_free(meta_ctx_gguf);
+                    return nullptr;
+                }
+            }
+
+            struct ggml_tensor * tensor_meta = ggml_get_tensor(meta_ctx, name.c_str());
+            if (tensor_meta->type != GGML_TYPE_F32 || ggml_n_dims(tensor_meta) != 1) {
+                LLAMA_LOG_ERROR("%s: direction tensor invalid: %s\n", __func__, name.c_str());
+                ggml_free(meta_ctx);
+                gguf_free(meta_ctx_gguf);
+                return nullptr;
+            }
+            n_bytes += ggml_nbytes(tensor_meta);
+        }
+        ggml_free(meta_ctx);
+        gguf_free(meta_ctx_gguf);
+    }
+
+    // load the tensor data into a new ggml context sized for all tensors plus overhead
+    struct ggml_init_params ggml_params = {
+        /* .mem_size   = */ ggml_tensor_overhead() * n_tensors + n_bytes,
+        /* .mem_buffer = */ nullptr,
+        /* .no_alloc   = */ false,
+    };
+    struct ggml_context * ctx = ggml_init(ggml_params);
+
+    struct gguf_init_params params = {
+        /* .no_alloc = */ false,
+        /* .ctx      = */ &ctx,
+    };
+    struct gguf_context * ctx_gguf = gguf_init_from_file(path, params);
+    if (!ctx_gguf) {
+        LLAMA_LOG_ERROR("%s: failed to load control vector from %s\n", __func__, path);
+        ggml_free(ctx);
+        return nullptr;
+    }
+
+    struct llama_control_vector * vector = new llama_control_vector();
+    vector->ctx = ctx;
+    vector->tensors.push_back(nullptr); // there's never a direction vector for layer 0
+    for (uint32_t i = 1; i <= max_direction_layer; i++) {
+        std::string name = format("direction.%u", i);
+        ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str());
+        if (tensor) {
+            vector->tensors.push_back(tensor);
+            // LLAMA_LOG_INFO("%s: found control vector tensor: t[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, ggml_n_dims(tensor), tensor->name, tensor->data);
+        } else {
+            vector->tensors.push_back(nullptr); // filler for layers without a direction
+        }
+    }
+
+    gguf_free(ctx_gguf); // the tensor data lives in ctx; the gguf metadata is no longer needed
+    return vector;
+}
+
+struct llama_control_vector * llama_control_vector_dup(const struct llama_control_vector * vector) {
+    struct llama_control_vector * new_vector = new llama_control_vector();
+    if (vector->ctx == nullptr) {
+        return new_vector;
+    }
+    struct ggml_init_params ggml_params = {
+        /* .mem_size   = */ ggml_get_mem_size(vector->ctx),
+        /* .mem_buffer = */ nullptr,
+        /* .no_alloc   = */ false,
+    };
+
+    struct ggml_context * ctx = ggml_init(ggml_params);
+
+    for (ggml_tensor * tensor : vector->tensors) {
+        if (tensor == nullptr) {
+            new_vector->tensors.push_back(nullptr);
+        } else {
+            ggml_tensor * new_tensor = ggml_dup_tensor(ctx, tensor);
+            memcpy(new_tensor->data, tensor->data, ggml_nbytes(tensor)); // ggml_dup_tensor copies shape and type, not data
+            new_vector->tensors.push_back(new_tensor);
+        }
+    }
+
+    new_vector->ctx = ctx;
+    return new_vector;
+}
+
+int32_t llama_control_vector_scale(struct llama_control_vector * vector, float strength) {
+    if (vector->ctx == nullptr) {
+        LLAMA_LOG_ERROR("%s: attempted to scale an unloaded control vector\n", __func__);
+        return 1;
+    }
+
+    for (ggml_tensor * tensor : vector->tensors) {
+        if (tensor == nullptr) continue;
+        for (int j = 0; j < ggml_nelements(tensor); j++) {
+            float v = ggml_get_f32_1d(tensor, j);
+            ggml_set_f32_1d(tensor, j, v * strength);
+        }
+    }
+
+    return 0;
+}
+
+int32_t llama_control_vector_add(struct llama_control_vector * vector, const struct llama_control_vector * other) {
+    if (vector->ctx == nullptr || other->ctx == nullptr) {
+        LLAMA_LOG_ERROR("%s: attempted to add with an unloaded control vector\n", __func__);
+        return 1;
+    }
+
+    size_t size = std::max(vector->tensors.size(), other->tensors.size());
+    for (size_t i = 0; i < size; i++) {
+        if (i >= vector->tensors.size()) {
+            vector->tensors.push_back(nullptr);
+        }
+
+        ggml_tensor * other_tensor = i < other->tensors.size() ? other->tensors[i] : nullptr;
+        if (other_tensor != nullptr) {
+            if (vector->tensors[i] == nullptr) {
+                ggml_tensor * new_tensor = ggml_dup_tensor(vector->ctx, other_tensor);
+                memcpy(new_tensor->data, other_tensor->data, ggml_nbytes(other_tensor)); // ggml_dup_tensor copies shape and type, not data
+                vector->tensors[i] = new_tensor;
+            } else {
+                ggml_tensor * this_tensor = vector->tensors[i];
+                size_t this_nelements = ggml_nelements(this_tensor);
+                size_t other_nelements = ggml_nelements(other_tensor);
+
+                if (this_nelements != other_nelements) {
+                    LLAMA_LOG_ERROR("%s: attempted to add control vectors of incompatible dimension: %zu != %zu\n", __func__, this_nelements, other_nelements);
+                    return 1;
+                }
+
+                for (size_t j = 0; j < this_nelements; j++) {
+                    float a = ggml_get_f32_1d(this_tensor, j);
+                    float b = ggml_get_f32_1d(other_tensor, j);
+                    ggml_set_f32_1d(this_tensor, j, a + b);
+                }
+            }
+        }
+    }
+
+    return 0;
+}
+
+void llama_control_vector_free(struct llama_control_vector * vector) {
+    delete vector;
+}
+
+void llama_apply_control_vector(
+        struct llama_context * lctx,
+        struct llama_control_vector * vector,
+        int32_t control_vector_layer_start,
+        int32_t control_vector_layer_end
+) {
+    lctx->control_vector = vector;
+    lctx->control_vector_layer_start = control_vector_layer_start;
+    lctx->control_vector_layer_end   = control_vector_layer_end;
+}
+
+void llama_clear_control_vector(struct llama_context * lctx) {
+    lctx->control_vector = nullptr;
+}
+
 struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq) {
     struct llama_kv_cache_view result = {
         /*.n_cells = */ 0,
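The patch itself ships no driver for these entry points. A minimal usage sketch, under the assumption that they are exposed in llama.h; the file names, strengths, and helper name below are made up:

// Hypothetical end-to-end flow: load, blend, scale, attach, detach, free.
#include "llama.h"

int steer(struct llama_context * lctx, int n_layer) {
    struct llama_control_vector * cv = llama_control_vector_load("happy.gguf"); // hypothetical file
    if (cv == nullptr) {
        return 1;
    }

    struct llama_control_vector * cv2 = llama_control_vector_load("calm.gguf"); // hypothetical file
    if (cv2 != nullptr) {
        llama_control_vector_scale(cv2, 0.5f);  // halve the second direction
        llama_control_vector_add(cv, cv2);      // blend it into the first
        llama_control_vector_free(cv2);
    }

    llama_control_vector_scale(cv, 0.8f); // overall strength; negative values steer away

    // apply to every transformer layer; direction indices are 1-based per the loader
    llama_apply_control_vector(lctx, cv, 1, n_layer);

    // ... run llama_decode(...) as usual ...

    llama_clear_control_vector(lctx); // detach before freeing
    llama_control_vector_free(cv);
    return 0;
}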