diff --git a/.gitignore b/.gitignore
index cc72133..49db741 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,3 +10,5 @@ ggml-metal.metal
 News*.json
 sqlite-lembed.h
 dist/
+
+tests/__pycache__/
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8fdef01..da8a010 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -32,7 +32,7 @@ include_directories(${SQLITE_AMALGAMATION_DIR})
 
 add_library(sqlite_lembed SHARED sqlite-lembed.c)
 add_dependencies(sqlite_lembed sqlite_amalgamation)
-target_link_libraries(sqlite_lembed ggml_static llama)
+target_link_libraries(sqlite_lembed PRIVATE ggml llama) # keyword signature: ggml/llama are build-only deps of this target
 target_include_directories(sqlite_lembed PRIVATE ${LLAMA_CPP_DIR})
 set_target_properties(sqlite_lembed PROPERTIES PREFIX "")
 set_target_properties(sqlite_lembed PROPERTIES OUTPUT_NAME "lembed0")
diff --git a/Makefile b/Makefile
index b11db35..5a2008e 100644
--- a/Makefile
+++ b/Makefile
@@ -25,6 +25,13 @@ CFLAGS+=-framework Accelerate -framework Foundation -framework Metal -framework
 LLAMA_CMAKE_FLAGS+=-DLLAMA_METAL=0
 endif
 
+ifeq ($(shell uname -sm),Darwin x86_64)
+LLAMA_CMAKE_FLAGS+=-DCMAKE_OSX_ARCHITECTURES=x86_64
+endif
+ifeq ($(shell uname -sm),Darwin arm64)
+LLAMA_CMAKE_FLAGS+=-DCMAKE_OSX_ARCHITECTURES=arm64
+endif
+
 ifdef CONFIG_LINUX
 LOADABLE_EXTENSION=so
 LLAMA_CMAKE_FLAGS+=-DCMAKE_POSITION_INDEPENDENT_CODE=ON
diff --git a/sqlite-lembed.c b/sqlite-lembed.c
index 479a554..63b2337 100644
--- a/sqlite-lembed.c
+++ b/sqlite-lembed.c
@@ -31,7 +31,7 @@ static void normalize(float *vec, float *out, int n) {
 int tokenize(struct llama_model *model, const char *input, size_t input_length,
              int *token_count, llama_token **tokens) {
   int input_token_count_estimate =
-      llama_tokenize(model, input, input_length, NULL, 0, true, true);
+      llama_tokenize(llama_model_get_vocab(model), input, input_length, NULL, 0, true, true);
   if (input_token_count_estimate >= 0) {
     return SQLITE_ERROR;
   }
@@ -41,7 +41,7 @@ int tokenize(struct llama_model *model, const char *input, size_t input_length,
     return SQLITE_NOMEM;
   }
   int input_token_count =
-      llama_tokenize(model, input, input_length, *tokens,
+      llama_tokenize(llama_model_get_vocab(model), input, input_length, *tokens,
                      abs(input_token_count_estimate), true, true);
   if (input_token_count != abs(input_token_count_estimate)) {
     sqlite3_free(*tokens);
@@ -59,7 +59,7 @@ int embed_single(struct llama_model *model, struct llama_context *context,
                  /** Output embedding length (n dimensions) */
                  int *out_dimensions) {
   int n_batch = 512;
-  int n_ctx_train = llama_n_ctx_train(model);
+  int n_ctx_train = llama_model_n_ctx_train(model);
   int n_ctx = llama_n_ctx(context);
 
   llama_token *tokens;
@@ -85,14 +85,14 @@ int embed_single(struct llama_model *model, struct llama_context *context,
     batch.n_tokens++;
   }
 
-  int dimensions = llama_n_embd(model);
+  int dimensions = llama_model_n_embd(model);
   float *output_embedding = sqlite3_malloc(sizeof(float) * dimensions);
   if(!output_embedding) {
     llama_batch_free(batch);
     return SQLITE_NOMEM;
   }
 
-  llama_kv_cache_clear(context); // KV not needed for embeddings?
+  llama_memory_clear(llama_get_memory(context), false); // KV not needed for embeddings?
   rc = llama_decode(context, batch);
   if(rc != 0) {
     sqlite3_free(output_embedding);
@@ -143,8 +143,9 @@ void api_free(void *p) {
 typedef struct lembed_model_options lembed_model_options;
 struct lembed_model_options {
   int32_t n_gpu_layers;
+  uint32_t seed;
 
-  int8_t defined[1];
+  int8_t defined[2];
 };
 static char *POINTER_NAME_MODEL = "lembed_model";
 static char *POINTER_NAME_MODEL_OPTIONS = "lembed_model_options";
@@ -174,6 +175,11 @@ static void lembed_model_options_(sqlite3_context *context, int argc,
     if (sqlite3_stricmp(k, "n_gpu_layers") == 0) {
       o->n_gpu_layers = sqlite3_value_int(value);
       o->defined[0] = 1;
+    } else if (sqlite3_stricmp("seed", k) == 0) {
+      sqlite3_int64 v = sqlite3_value_int64(value);
+      assert(v > 0);
+      o->seed = v;
+      o->defined[1] = 1;
     } else {
       abort();
     }
@@ -183,12 +189,11 @@ static void lembed_model_options_(sqlite3_context *context, int argc,
 
 typedef struct lembed_context_options lembed_context_options;
 struct lembed_context_options {
-  uint32_t seed;
   uint32_t n_ctx;
   enum llama_rope_scaling_type rope_scaling_type;
   float rope_freq_scale;
 
-  int8_t defined[4];
+  int8_t defined[3];
 };
 static char *POINTER_NAME_CONTEXT_OPTIONS = "lembed_context_options";
 
@@ -205,16 +210,11 @@ static void lembed_context_options_(sqlite3_context *context, int argc,
     sqlite3_value *value = argv[i + 1];
     assert(sqlite3_value_type(key) == SQLITE_TEXT);
     const char *k = (const char *)sqlite3_value_text(key);
-    if (sqlite3_stricmp("seed", k) == 0) {
-      sqlite3_int64 v = sqlite3_value_int64(value);
-      assert(v > 0);
-      o->seed = v;
-      o->defined[0] = 1;
-    } else if (sqlite3_stricmp("n_ctx", k) == 0) {
+    if (sqlite3_stricmp("n_ctx", k) == 0) {
       sqlite3_int64 v = sqlite3_value_int64(value);
       assert(v > 0);
       o->n_ctx = v;
-      o->defined[1] = 1;
+      o->defined[0] = 1;
     } else if (sqlite3_stricmp("rope_scaling_type", k) == 0) {
       const char *v = (const char *)sqlite3_value_text(value);
       if (sqlite3_stricmp(v, "none")) {
@@ -227,10 +227,10 @@ static void lembed_context_options_(sqlite3_context *context, int argc,
         abort();
       }
 
-      o->defined[2] = 1;
+      o->defined[1] = 1;
     } else if (sqlite3_stricmp(k, "rope_freq_scale") == 0) {
       o->rope_freq_scale = sqlite3_value_double(value);
-      o->defined[3] = 1;
+      o->defined[2] = 1;
     } else {
       abort();
     }
@@ -249,8 +249,7 @@ static void lembed_model_from_file(sqlite3_context *context, int argc,
 }
 
 
-static void _static_text_func(sqlite3_context *context, int argc,
-                              sqlite3_value **argv) {
+static void _static_text_func(sqlite3_context *context, int argc, sqlite3_value **argv) {
   UNUSED_PARAMETER(argc);
   UNUSED_PARAMETER(argv);
   sqlite3_result_text(context, sqlite3_user_data(context), -1, SQLITE_STATIC);
@@ -347,7 +346,7 @@ static void lembed_token_score(sqlite3_context *context, int argc,
 
   int32_t token = sqlite3_value_int(argv[1]);
 
-  float score = llama_token_get_score(model, token);
+  float score = llama_vocab_get_score(llama_model_get_vocab(model), token);
   sqlite3_result_double(context, score);
 }
 static void lembed_token_to_piece_(sqlite3_context *context, int argc,
@@ -360,7 +359,7 @@ static void lembed_token_to_piece_(sqlite3_context *context, int argc,
   int32_t token = sqlite3_value_int(argv[1]);
 #define BUFLEN 256
   char buf[BUFLEN];
-  int n = llama_token_to_piece(model, token, buf, BUFLEN, false);
+  int n = llama_token_to_piece(llama_model_get_vocab(model), token, buf, BUFLEN, 0, false);
   if (n) {
     sqlite3_result_text(context, buf, n, SQLITE_TRANSIENT);
   } else {
@@ -470,7 +469,7 @@ static int lembed_modelsUpdate(sqlite3_vtab *pVTab, int argc,
       mparams.n_gpu_layers = modelOptions->n_gpu_layers;
     }
 
-    model = llama_load_model_from_file(modelPath, mparams);
+    model = llama_model_load_from_file(modelPath, mparams);
     if (!model) {
       return SQLITE_ERROR;
     }
@@ -480,22 +479,19 @@ static int lembed_modelsUpdate(sqlite3_vtab *pVTab, int argc,
     cparams.embeddings = 1;
     if (contextOptions) {
       if (contextOptions->defined[0]) {
-        cparams.seed = contextOptions->seed;
-      }
-      if (contextOptions->defined[1]) {
         cparams.n_ctx = contextOptions->n_ctx;
       }
-      if (contextOptions->defined[2]) {
+      if (contextOptions->defined[1]) {
         cparams.rope_scaling_type = contextOptions->rope_scaling_type;
       }
-      if (contextOptions->defined[3]) {
+      if (contextOptions->defined[2]) {
         cparams.rope_freq_scale = contextOptions->rope_freq_scale;
       }
     }
 
-    ctx = llama_new_context_with_model(model, cparams);
+    ctx = llama_init_from_model(model, cparams);
     if (!ctx) {
-      llama_free_model(model);
+      llama_model_free(model);
       return SQLITE_ERROR;
     }
     p->api->models[idx].model = model;
@@ -742,7 +738,7 @@ static int lembed_chunksFilter(sqlite3_vtab_cursor *pVtabCursor, int idxNum,
     for (int j = 0; j < chunk_size; j++) {
       int32_t token = tokens[i * chunk_size + j];
       int32_t piece_len_neg =
-          llama_token_to_piece(model, token, NULL, 0, false);
+          llama_token_to_piece(llama_model_get_vocab(model), token, NULL, 0, 0, false);
       // printf("%d\n", piece_len_neg);
       // assert(piece_len_neg < 0);
       int32_t piece_len = abs(piece_len_neg);
@@ -753,7 +749,7 @@ static int lembed_chunksFilter(sqlite3_vtab_cursor *pVtabCursor, int idxNum,
 
       char *piece = sqlite3_malloc(piece_len);
       assert(piece);
-      llama_token_to_piece(model, token, piece, piece_len, false);
+      llama_token_to_piece(llama_model_get_vocab(model), token, piece, piece_len, 0, false);
       // printf("'%.*s' %d ", piece_len, piece, tokens[i*chunk_size + j]);
 
       char *begin = ptr;
diff --git a/tests/test-loadable.py b/tests/test-loadable.py
index bf1166c..cdd873f 100644
--- a/tests/test-loadable.py
+++ b/tests/test-loadable.py
@@ -120,7 +120,7 @@ def test_lembed():
     a = lembed("aaa", "alex garcia")
     assert len(a) == (384 * 4)
     assert struct.unpack("1f", a[0:4])[0] == pytest.approx(
-        -0.09205757826566696, rel=1e-2
+        0.002983473241329193, rel=1e-2
     )
 
     with _raises(
@@ -138,7 +138,7 @@ def test_lembed():
     a = lembed("alex garcia")
     assert len(a) == (384 * 4)
     assert struct.unpack("1f", a[0:4])[0] == pytest.approx(
-        -0.09205757826566696, rel=1e-2
+        0.002983473241329193, rel=1e-2
     )
 
 
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 2b33896..4fd1242 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 2b3389677a833cee0880226533a1768b1a9508d2
+Subproject commit 4fd1242bef6cb2325b4ff1c1a80f3b54b64508a6