@@ -175,8 +175,8 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa

 # define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
 LLAMA_SESSION_MAGIC = LLAMA_FILE_MAGIC_GGSN
-# define LLAMA_SESSION_VERSION 4
-LLAMA_SESSION_VERSION = 4
+# define LLAMA_SESSION_VERSION 5
+LLAMA_SESSION_VERSION = 5


 # struct llama_model;
@@ -274,6 +274,7 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa
 # LLAMA_FTYPE_MOSTLY_IQ2_S = 28, // except 1d tensors
 # LLAMA_FTYPE_MOSTLY_IQ2_M = 29, // except 1d tensors
 # LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors
+# LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors

 # LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
 # };
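
Elsewhere in this module each entry of this C enum comment is mirrored by a module-level Python constant (as with `LLAMA_SESSION_VERSION` above). A minimal sketch of the matching constant for the new entry, following that convention, with the name and value taken directly from the comment:

```python
LLAMA_FTYPE_MOSTLY_IQ1_M = 31  # except 1d tensors
```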
@@ -677,6 +678,7 @@ class llama_context_params(ctypes.Structure):
 # bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
 # bool pure; // quantize all tensors to the default type
 # void * imatrix; // pointer to importance matrix data
+# void * kv_overrides; // pointer to vector containing overrides
 # } llama_model_quantize_params;
 class llama_model_quantize_params(ctypes.Structure):
     """Parameters for llama_model_quantize
@@ -691,6 +693,7 @@ class llama_model_quantize_params(ctypes.Structure):
         only_copy (bool): only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
         pure (bool): quantize all tensors to the default type
         imatrix (ctypes.c_void_p): pointer to importance matrix data
+        kv_overrides (ctypes.c_void_p): pointer to vector containing overrides
     """

     _fields_ = [
@@ -703,6 +706,7 @@ class llama_model_quantize_params(ctypes.Structure):
703706 ("only_copy" , ctypes .c_bool ),
704707 ("pure" , ctypes .c_bool ),
705708 ("imatrix" , ctypes .c_void_p ),
709+ ("kv_overrides" , ctypes .c_void_p ),
706710 ]
707711
708712
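
For context, a minimal sketch of driving a quantization with the extended struct from Python. It assumes the bindings also expose `llama_model_quantize_default_params()` and `llama_model_quantize()` (wrappers for the C functions of the same names), uses illustrative file names, and leaves the new `kv_overrides` pointer as NULL, i.e. no metadata overrides:

```python
import ctypes
import llama_cpp

# Assumed helper from the same bindings: returns a llama_model_quantize_params
# struct pre-filled with the library defaults.
params = llama_cpp.llama_model_quantize_default_params()
params.nthread = 4
params.ftype = llama_cpp.LLAMA_FTYPE_MOSTLY_Q4_K_M
# The new field is an opaque pointer (ctypes.c_void_p); None maps to NULL,
# which means "apply no KV metadata overrides".
params.kv_overrides = None

# Illustrative input/output paths.
llama_cpp.llama_model_quantize(
    b"model-f16.gguf", b"model-q4_k_m.gguf", ctypes.byref(params)
)
```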
@@ -1838,9 +1842,9 @@ def llama_synchronize(ctx: llama_context_p, /):


 # // Token logits obtained from the last call to llama_decode()
-# // The logits for the last token are stored in the last row
-# // Logits for which llama_batch.logits[i] == 0 are undefined
-# // Rows: n_tokens provided with llama_batch
+# // The logits for which llama_batch.logits[i] != 0 are stored contiguously
+# // in the order they have appeared in the batch.
+# // Rows: number of tokens for which llama_batch.logits[i] != 0
 # // Cols: n_vocab
 # LLAMA_API float * llama_get_logits(struct llama_context * ctx);
 @ctypes_function(
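
A sketch of how a caller might read the packed buffer under the new layout; `n_outputs` is assumed to be tracked by the caller (the number of batch positions submitted with `logits[i] != 0`), and `llama_n_vocab` comes from the same bindings:

```python
import llama_cpp

def read_output_logits(ctx, model, n_outputs):
    # One contiguous row of n_vocab logits per requested output,
    # in the order those tokens appeared in the batch.
    n_vocab = llama_cpp.llama_n_vocab(model)
    logits = llama_cpp.llama_get_logits(ctx)
    return [
        [logits[row * n_vocab + col] for col in range(n_vocab)]
        for row in range(n_outputs)
    ]
```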
@@ -1859,7 +1863,8 @@ def llama_get_logits(ctx: llama_context_p, /) -> CtypesArray[ctypes.c_float]:


 # // Logits for the ith token. Equivalent to:
-# // llama_get_logits(ctx) + i*n_vocab
+# // llama_get_logits(ctx) + ctx->output_ids[i]*n_vocab
+# // returns NULL for invalid ids.
 # LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);
 @ctypes_function(
     "llama_get_logits_ith",
@@ -1874,8 +1879,12 @@ def llama_get_logits_ith(
     ...


-# // Get all output token embeddings
-# // shape: [n_tokens*n_embd] (1-dimensional)
+# // Get all output token embeddings.
+# // when pooling_type == LLAMA_POOLING_TYPE_NONE or when using a generative model,
+# // the embeddings for which llama_batch.logits[i] != 0 are stored contiguously
+# // in the order they have appeared in the batch.
+# // shape: [n_outputs*n_embd]
+# // Otherwise, returns NULL.
 # LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
 @ctypes_function(
     "llama_get_embeddings", [llama_context_p_ctypes], ctypes.POINTER(ctypes.c_float)
@@ -1886,9 +1895,10 @@ def llama_get_embeddings(ctx: llama_context_p, /) -> CtypesArray[ctypes.c_float]
     ...


-# // Get the embeddings for the ith token
-# // llama_get_embeddings(ctx) + i*n_embd
+# // Get the embeddings for the ith token. Equivalent to:
+# // llama_get_embeddings(ctx) + ctx->output_ids[i]*n_embd
 # // shape: [n_embd] (1-dimensional)
+# // returns NULL for invalid ids.
 # LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
 @ctypes_function(
     "llama_get_embeddings_ith",