diff --git a/.gitignore b/.gitignore index cc72133..49db741 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,5 @@ ggml-metal.metal News*.json sqlite-lembed.h dist/ + +tests/__pycache__/ diff --git a/CMakeLists.txt b/CMakeLists.txt index 8fdef01..da8a010 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,7 +32,7 @@ include_directories(${SQLITE_AMALGAMATION_DIR}) add_library(sqlite_lembed SHARED sqlite-lembed.c) add_dependencies(sqlite_lembed sqlite_amalgamation) -target_link_libraries(sqlite_lembed ggml_static llama) +target_link_libraries(sqlite_lembed ggml llama) target_include_directories(sqlite_lembed PRIVATE ${LLAMA_CPP_DIR}) set_target_properties(sqlite_lembed PROPERTIES PREFIX "") set_target_properties(sqlite_lembed PROPERTIES OUTPUT_NAME "lembed0") diff --git a/Makefile b/Makefile index b11db35..5a2008e 100644 --- a/Makefile +++ b/Makefile @@ -25,6 +25,13 @@ CFLAGS+=-framework Accelerate -framework Foundation -framework Metal -framework LLAMA_CMAKE_FLAGS+=-DLLAMA_METAL=0 endif +ifeq ($(shell uname -sm),Darwin x86_64) +LLAMA_CMAKE_FLAGS+=-DCMAKE_OSX_ARCHITECTURES=x86_64 +endif +ifeq ($(shell uname -sm),Darwin arm64) +LLAMA_CMAKE_FLAGS+=-DCMAKE_OSX_ARCHITECTURES=arm64 +endif + ifdef CONFIG_LINUX LOADABLE_EXTENSION=so LLAMA_CMAKE_FLAGS+=-DCMAKE_POSITION_INDEPENDENT_CODE=ON diff --git a/sqlite-lembed.c b/sqlite-lembed.c index 479a554..63b2337 100644 --- a/sqlite-lembed.c +++ b/sqlite-lembed.c @@ -31,7 +31,7 @@ static void normalize(float *vec, float *out, int n) { int tokenize(struct llama_model *model, const char *input, size_t input_length, int *token_count, llama_token **tokens) { int input_token_count_estimate = - llama_tokenize(model, input, input_length, NULL, 0, true, true); + llama_tokenize(llama_model_get_vocab(model), input, input_length, NULL, 0, true, true); if (input_token_count_estimate >= 0) { return SQLITE_ERROR; } @@ -41,7 +41,7 @@ int tokenize(struct llama_model *model, const char *input, size_t input_length, return SQLITE_NOMEM; } int input_token_count = - llama_tokenize(model, input, input_length, *tokens, + llama_tokenize(llama_model_get_vocab(model), input, input_length, *tokens, abs(input_token_count_estimate), true, true); if (input_token_count != abs(input_token_count_estimate)) { sqlite3_free(*tokens); @@ -59,7 +59,7 @@ int embed_single(struct llama_model *model, struct llama_context *context, /** Output embedding length (n dimensions) */ int *out_dimensions) { int n_batch = 512; - int n_ctx_train = llama_n_ctx_train(model); + int n_ctx_train = llama_model_n_ctx_train(model); int n_ctx = llama_n_ctx(context); llama_token *tokens; @@ -85,14 +85,14 @@ int embed_single(struct llama_model *model, struct llama_context *context, batch.n_tokens++; } - int dimensions = llama_n_embd(model); + int dimensions = llama_model_n_embd(model); float *output_embedding = sqlite3_malloc(sizeof(float) * dimensions); if(!output_embedding) { llama_batch_free(batch); return SQLITE_NOMEM; } - llama_kv_cache_clear(context); // KV not needed for embeddings? + llama_memory_clear(llama_get_memory(context), false); // KV not needed for embeddings? rc = llama_decode(context, batch); if(rc != 0) { sqlite3_free(output_embedding); @@ -143,8 +143,9 @@ void api_free(void *p) { typedef struct lembed_model_options lembed_model_options; struct lembed_model_options { int32_t n_gpu_layers; + uint32_t seed; - int8_t defined[1]; + int8_t defined[2]; }; static char *POINTER_NAME_MODEL = "lembed_model"; static char *POINTER_NAME_MODEL_OPTIONS = "lembed_model_options"; @@ -174,6 +175,11 @@ static void lembed_model_options_(sqlite3_context *context, int argc, if (sqlite3_stricmp(k, "n_gpu_layers") == 0) { o->n_gpu_layers = sqlite3_value_int(value); o->defined[0] = 1; + } else if (sqlite3_stricmp("seed", k) == 0) { + sqlite3_int64 v = sqlite3_value_int64(value); + assert(v > 0); + o->seed = v; + o->defined[1] = 1; } else { abort(); } @@ -183,12 +189,11 @@ static void lembed_model_options_(sqlite3_context *context, int argc, typedef struct lembed_context_options lembed_context_options; struct lembed_context_options { - uint32_t seed; uint32_t n_ctx; enum llama_rope_scaling_type rope_scaling_type; float rope_freq_scale; - int8_t defined[4]; + int8_t defined[3]; }; static char *POINTER_NAME_CONTEXT_OPTIONS = "lembed_context_options"; @@ -205,16 +210,11 @@ static void lembed_context_options_(sqlite3_context *context, int argc, sqlite3_value *value = argv[i + 1]; assert(sqlite3_value_type(key) == SQLITE_TEXT); const char *k = (const char *)sqlite3_value_text(key); - if (sqlite3_stricmp("seed", k) == 0) { - sqlite3_int64 v = sqlite3_value_int64(value); - assert(v > 0); - o->seed = v; - o->defined[0] = 1; - } else if (sqlite3_stricmp("n_ctx", k) == 0) { + if (sqlite3_stricmp("n_ctx", k) == 0) { sqlite3_int64 v = sqlite3_value_int64(value); assert(v > 0); o->n_ctx = v; - o->defined[1] = 1; + o->defined[0] = 1; } else if (sqlite3_stricmp("rope_scaling_type", k) == 0) { const char *v = (const char *)sqlite3_value_text(value); if (sqlite3_stricmp(v, "none")) { @@ -227,10 +227,10 @@ static void lembed_context_options_(sqlite3_context *context, int argc, abort(); } - o->defined[2] = 1; + o->defined[1] = 1; } else if (sqlite3_stricmp(k, "rope_freq_scale") == 0) { o->rope_freq_scale = sqlite3_value_double(value); - o->defined[3] = 1; + o->defined[2] = 1; } else { abort(); } @@ -249,8 +249,7 @@ static void lembed_model_from_file(sqlite3_context *context, int argc, } -static void _static_text_func(sqlite3_context *context, int argc, - sqlite3_value **argv) { +static void _static_text_func(sqlite3_context *context, int argc, sqlite3_value **argv) { UNUSED_PARAMETER(argc); UNUSED_PARAMETER(argv); sqlite3_result_text(context, sqlite3_user_data(context), -1, SQLITE_STATIC); @@ -347,7 +346,7 @@ static void lembed_token_score(sqlite3_context *context, int argc, int32_t token = sqlite3_value_int(argv[1]); - float score = llama_token_get_score(model, token); + float score = llama_vocab_get_score(llama_model_get_vocab(model), token); sqlite3_result_double(context, score); } static void lembed_token_to_piece_(sqlite3_context *context, int argc, @@ -360,7 +359,7 @@ static void lembed_token_to_piece_(sqlite3_context *context, int argc, int32_t token = sqlite3_value_int(argv[1]); #define BUFLEN 256 char buf[BUFLEN]; - int n = llama_token_to_piece(model, token, buf, BUFLEN, false); + int n = llama_token_to_piece(llama_model_get_vocab(model), token, buf, BUFLEN, 0, false); if (n) { sqlite3_result_text(context, buf, n, SQLITE_TRANSIENT); } else { @@ -470,7 +469,7 @@ static int lembed_modelsUpdate(sqlite3_vtab *pVTab, int argc, mparams.n_gpu_layers = modelOptions->n_gpu_layers; } - model = llama_load_model_from_file(modelPath, mparams); + model = llama_model_load_from_file(modelPath, mparams); if (!model) { return SQLITE_ERROR; } @@ -480,22 +479,19 @@ static int lembed_modelsUpdate(sqlite3_vtab *pVTab, int argc, cparams.embeddings = 1; if (contextOptions) { if (contextOptions->defined[0]) { - cparams.seed = contextOptions->seed; - } - if (contextOptions->defined[1]) { cparams.n_ctx = contextOptions->n_ctx; } - if (contextOptions->defined[2]) { + if (contextOptions->defined[1]) { cparams.rope_scaling_type = contextOptions->rope_scaling_type; } - if (contextOptions->defined[3]) { + if (contextOptions->defined[2]) { cparams.rope_freq_scale = contextOptions->rope_freq_scale; } } - ctx = llama_new_context_with_model(model, cparams); + ctx = llama_init_from_model(model, cparams); if (!ctx) { - llama_free_model(model); + llama_model_free(model); return SQLITE_ERROR; } p->api->models[idx].model = model; @@ -742,7 +738,7 @@ static int lembed_chunksFilter(sqlite3_vtab_cursor *pVtabCursor, int idxNum, for (int j = 0; j < chunk_size; j++) { int32_t token = tokens[i * chunk_size + j]; int32_t piece_len_neg = - llama_token_to_piece(model, token, NULL, 0, false); + llama_token_to_piece(llama_model_get_vocab(model), token, NULL, 0, 0, false); // printf("%d\n", piece_len_neg); // assert(piece_len_neg < 0); int32_t piece_len = abs(piece_len_neg); @@ -753,7 +749,7 @@ static int lembed_chunksFilter(sqlite3_vtab_cursor *pVtabCursor, int idxNum, char *piece = sqlite3_malloc(piece_len); assert(piece); - llama_token_to_piece(model, token, piece, piece_len, false); + llama_token_to_piece(llama_model_get_vocab(model), token, piece, piece_len, 0, false); // printf("'%.*s' %d ", piece_len, piece, tokens[i*chunk_size + j]); char *begin = ptr; @@ -918,4 +914,4 @@ __declspec(dllexport) sqlite3_create_module_v2(db, "lembed_chunks", &lembed_chunksModule, a, NULL); sqlite3_create_module_v2(db, "lembed_models", &lembed_modelsModule, a, NULL); return SQLITE_OK; -} +} \ No newline at end of file diff --git a/tests/test-loadable.py b/tests/test-loadable.py index bf1166c..cdd873f 100644 --- a/tests/test-loadable.py +++ b/tests/test-loadable.py @@ -120,7 +120,7 @@ def test_lembed(): a = lembed("aaa", "alex garcia") assert len(a) == (384 * 4) assert struct.unpack("1f", a[0:4])[0] == pytest.approx( - -0.09205757826566696, rel=1e-2 + 0.002983473241329193, rel=1e-2 ) with _raises( @@ -138,7 +138,7 @@ def test_lembed(): a = lembed("alex garcia") assert len(a) == (384 * 4) assert struct.unpack("1f", a[0:4])[0] == pytest.approx( - -0.09205757826566696, rel=1e-2 + 0.002983473241329193, rel=1e-2 ) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 2b33896..4fd1242 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 2b3389677a833cee0880226533a1768b1a9508d2 +Subproject commit 4fd1242bef6cb2325b4ff1c1a80f3b54b64508a6