 # LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30,
 # LLAMA_VOCAB_PRE_TYPE_TRILLION = 31,
 # LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32,
+# LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33,
+# LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34,
 # };
 LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0
 LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1
 LLAMA_VOCAB_PRE_TYPE_DBRX = 13
 LLAMA_VOCAB_PRE_TYPE_SMAUG = 14
 LLAMA_VOCAB_PRE_TYPE_PORO = 15
-LLAMA_VOCAV_PRE_TYPE_CHATGLM3 = 16
+LLAMA_VOCAB_PRE_TYPE_CHATGLM3 = 16
 LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17
 LLAMA_VOCAB_PRE_TYPE_VIKING = 18
 LLAMA_VOCAB_PRE_TYPE_JAIS = 19
 LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30
 LLAMA_VOCAB_PRE_TYPE_TRILLION = 31
 LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32
+LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33
+LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34


 # // note: these values should be synchronized with ggml_rope
@@ -891,17 +895,18 @@ class llama_context_params(ctypes.Structure):

 # // model quantization parameters
 # typedef struct llama_model_quantize_params {
-# int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
-# enum llama_ftype ftype; // quantize to this llama_ftype
-# enum ggml_type output_tensor_type; // output tensor type
-# enum ggml_type token_embedding_type; // token embeddings tensor type
-# bool allow_requantize; // allow quantizing non-f32/f16 tensors
-# bool quantize_output_tensor; // quantize output.weight
-# bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
-# bool pure; // quantize all tensors to the default type
-# bool keep_split; // quantize to the same number of shards
-# void * imatrix; // pointer to importance matrix data
-# void * kv_overrides; // pointer to vector containing overrides
+# int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
+# enum llama_ftype ftype; // quantize to this llama_ftype
+# enum ggml_type output_tensor_type; // output tensor type
+# enum ggml_type token_embedding_type; // token embeddings tensor type
+# bool allow_requantize; // allow quantizing non-f32/f16 tensors
+# bool quantize_output_tensor; // quantize output.weight
+# bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
+# bool pure; // quantize all tensors to the default type
+# bool keep_split; // quantize to the same number of shards
+# void * imatrix; // pointer to importance matrix data
+# void * kv_overrides; // pointer to vector containing overrides
+# void * tensor_types; // pointer to vector containing tensor types
 # } llama_model_quantize_params;
 class llama_model_quantize_params(ctypes.Structure):
     """Parameters for llama_model_quantize
@@ -918,6 +923,7 @@ class llama_model_quantize_params(ctypes.Structure):
         keep_split (bool): quantize to the same number of shards
         imatrix (ctypes.c_void_p): pointer to importance matrix data
         kv_overrides (ctypes.c_void_p): pointer to vector containing overrides
+        tensor_types (ctypes.c_void_p): pointer to vector containing tensor types
     """

     if TYPE_CHECKING:
@@ -932,6 +938,7 @@ class llama_model_quantize_params(ctypes.Structure):
         keep_split: bool
         imatrix: ctypes.c_void_p
         kv_overrides: ctypes.c_void_p
+        tensor_types: ctypes.c_void_p

     _fields_ = [
         ("nthread", ctypes.c_int32),
@@ -945,6 +952,7 @@ class llama_model_quantize_params(ctypes.Structure):
         ("keep_split", ctypes.c_bool),
         ("imatrix", ctypes.c_void_p),
         ("kv_overrides", ctypes.c_void_p),
+        ("tensor_types", ctypes.c_void_p),
     ]


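# A minimal usage sketch of the struct above, assuming LLAMA_FTYPE_MOSTLY_Q4_K_M
# and the llama_model_quantize() wrapper are available from these bindings; the
# file paths are placeholders. The new tensor_types pointer is left NULL, i.e.
# no per-tensor type overrides are supplied.
_qparams = llama_model_quantize_params()
_qparams.nthread = 0                        # <= 0 -> std::thread::hardware_concurrency()
_qparams.ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M  # target quantization type (assumed constant)
_qparams.quantize_output_tensor = True      # also quantize output.weight
_qparams.tensor_types = None                # NULL: no vector of per-tensor types
llama_model_quantize(b"model-f16.gguf", b"model-q4_k_m.gguf", ctypes.byref(_qparams))
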
@@ -3812,6 +3820,7 @@ def llama_sampler_init_softmax() -> llama_sampler_p:


 # /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
+# /// Setting k <= 0 makes this a noop
 # LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k);
 @ctypes_function("llama_sampler_init_top_k", [ctypes.c_int32], llama_sampler_p_ctypes)
 def llama_sampler_init_top_k(k: int) -> llama_sampler_p:
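# A minimal usage sketch, assuming the llama_sampler_chain_* wrappers defined
# elsewhere in these bindings: build a sampler chain and add a top-k stage that
# keeps only the 40 most likely tokens (per the note above, k <= 0 would make
# this stage a noop).
_chain = llama_sampler_chain_init(llama_sampler_chain_default_params())
llama_sampler_chain_add(_chain, llama_sampler_init_top_k(40))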