@@ -333,7 +333,7 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa
333333LLAMA_ROPE_TYPE_GLM = 4
334334
335335
# enum llama_token_type { //TODO: remove, required until per token attributes are available from GGUF file
337337# LLAMA_TOKEN_TYPE_UNDEFINED = 0,
338338# LLAMA_TOKEN_TYPE_NORMAL = 1,
339339# LLAMA_TOKEN_TYPE_UNKNOWN = 2,
@@ -351,6 +351,32 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa
351351LLAMA_TOKEN_TYPE_BYTE = 6
352352
353353
# enum llama_token_attr {
#     LLAMA_TOKEN_ATTR_UNDEFINED    = 0,
#     LLAMA_TOKEN_ATTR_UNKNOWN      = 1 << 0,
#     LLAMA_TOKEN_ATTR_UNUSED       = 1 << 1,
#     LLAMA_TOKEN_ATTR_NORMAL       = 1 << 2,
#     LLAMA_TOKEN_ATTR_CONTROL      = 1 << 3,  // SPECIAL?
#     LLAMA_TOKEN_ATTR_USER_DEFINED = 1 << 4,
#     LLAMA_TOKEN_ATTR_BYTE         = 1 << 5,
#     LLAMA_TOKEN_ATTR_NORMALIZED   = 1 << 6,
#     LLAMA_TOKEN_ATTR_LSTRIP       = 1 << 7,
#     LLAMA_TOKEN_ATTR_RSTRIP       = 1 << 8,
#     LLAMA_TOKEN_ATTR_SINGLE_WORD  = 1 << 9,
# };
# Python mirror of the C enum quoted above. Apart from UNDEFINED (0), every
# value is a distinct single bit, so attributes can be tested or combined with
# bitwise operators (e.g. ``attr & LLAMA_TOKEN_ATTR_CONTROL``).
LLAMA_TOKEN_ATTR_UNDEFINED = 0
LLAMA_TOKEN_ATTR_UNKNOWN = 1 << 0
LLAMA_TOKEN_ATTR_UNUSED = 1 << 1
LLAMA_TOKEN_ATTR_NORMAL = 1 << 2
LLAMA_TOKEN_ATTR_CONTROL = 1 << 3
LLAMA_TOKEN_ATTR_USER_DEFINED = 1 << 4
LLAMA_TOKEN_ATTR_BYTE = 1 << 5
LLAMA_TOKEN_ATTR_NORMALIZED = 1 << 6
LLAMA_TOKEN_ATTR_LSTRIP = 1 << 7
LLAMA_TOKEN_ATTR_RSTRIP = 1 << 8
LLAMA_TOKEN_ATTR_SINGLE_WORD = 1 << 9
378+
379+
354380# // model file types
355381# enum llama_ftype {
356382# LLAMA_FTYPE_ALL_F32 = 0,
@@ -959,6 +985,9 @@ class llama_model_quantize_params(ctypes.Structure):
959985# // modifies a preceding LLAMA_GRETYPE_CHAR or
960986# // LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
961987# LLAMA_GRETYPE_CHAR_ALT = 6,
988+
989+ # // any character (.)
990+ # LLAMA_GRETYPE_CHAR_ANY = 7,
962991# };
963992LLAMA_GRETYPE_END = 0
964993LLAMA_GRETYPE_ALT = 1
@@ -967,6 +996,7 @@ class llama_model_quantize_params(ctypes.Structure):
LLAMA_GRETYPE_CHAR_NOT = 4
LLAMA_GRETYPE_CHAR_RNG_UPPER = 5
LLAMA_GRETYPE_CHAR_ALT = 6
LLAMA_GRETYPE_CHAR_ANY = 7  # any character (.)
9701000
9711001
9721002# typedef struct llama_grammar_element {
@@ -2438,11 +2468,11 @@ def llama_token_get_score(
24382468) -> float : ...
24392469
24402470
# LLAMA_API enum llama_token_attr llama_token_get_attr(const struct llama_model * model, llama_token token);
@ctypes_function(
    "llama_token_get_attr", [llama_model_p_ctypes, llama_token], ctypes.c_int
)
def llama_token_get_attr(
    model: llama_model_p, token: Union[llama_token, int], /
) -> int:
    """Return the ``llama_token_attr`` value of ``token`` in ``model``.

    Binding for the C function ``llama_token_get_attr``; the C return type is
    ``enum llama_token_attr`` and is declared here as ``ctypes.c_int``.

    Args:
        model: Pointer to the model (positional-only).
        token: Token id (positional-only).

    Returns:
        The attribute value as an ``int``.
    """
24482478
@@ -3200,104 +3230,9 @@ def llama_grammar_accept_token(
32003230
32013231
32023232# //
3203- # // Beam search
3233+ # // Model split
32043234# //
32053235
3206- # struct llama_beam_view {
3207- # const llama_token * tokens;
3208-
3209-
3210- # size_t n_tokens;
3211- # float p; // Cumulative beam probability (renormalized relative to all beams)
3212- # bool eob; // Callback should set this to true when a beam is at end-of-beam.
3213- # };
3214- class llama_beam_view (ctypes .Structure ):
3215- if TYPE_CHECKING :
3216- tokens : CtypesArray [llama_token ]
3217- n_tokens : int
3218- p : float
3219- eob : bool
3220-
3221- _fields_ = [
3222- ("tokens" , llama_token_p ),
3223- ("n_tokens" , ctypes .c_size_t ),
3224- ("p" , ctypes .c_float ),
3225- ("eob" , ctypes .c_bool ),
3226- ]
3227-
3228-
3229- # // Passed to beam_search_callback function.
3230- # // Whenever 0 < common_prefix_length, this number of tokens should be copied from any of the beams
3231- # // (e.g. beams[0]) as they will be removed (shifted) from all beams in all subsequent callbacks.
3232- # // These pointers are valid only during the synchronous callback, so should not be saved.
3233- # struct llama_beams_state {
3234- # struct llama_beam_view * beam_views;
3235- # size_t n_beams; // Number of elements in beam_views[].
3236- # size_t common_prefix_length; // Current max length of prefix tokens shared by all beams.
3237- # bool last_call; // True iff this is the last callback invocation.
3238- # };
3239- class llama_beams_state (ctypes .Structure ):
3240- if TYPE_CHECKING :
3241- beam_views : CtypesArray [llama_beam_view ]
3242- n_beams : int
3243- common_prefix_length : int
3244- last_call : bool
3245-
3246- _fields_ = [
3247- ("beam_views" , ctypes .POINTER (llama_beam_view )),
3248- ("n_beams" , ctypes .c_size_t ),
3249- ("common_prefix_length" , ctypes .c_size_t ),
3250- ("last_call" , ctypes .c_bool ),
3251- ]
3252-
3253-
3254- # // Type of pointer to the beam_search_callback function.
3255- # // void* callback_data is any custom data passed to llama_beam_search, that is subsequently
3256- # // passed back to beam_search_callback. This avoids having to use global variables in the callback.
3257- # typedef void (*llama_beam_search_callback_fn_t)(void * callback_data, struct llama_beams_state);
3258- llama_beam_search_callback_fn_t = ctypes .CFUNCTYPE (
3259- None , ctypes .c_void_p , llama_beams_state
3260- )
3261-
3262-
3263- # /// @details Deterministically returns entire sentence constructed by a beam search.
3264- # /// @param ctx Pointer to the llama_context.
3265- # /// @param callback Invoked for each iteration of the beam_search loop, passing in beams_state.
3266- # /// @param callback_data A pointer that is simply passed back to callback.
3267- # /// @param n_beams Number of beams to use.
3268- # /// @param n_past Number of tokens already evaluated.
3269- # /// @param n_predict Maximum number of tokens to predict. EOS may occur earlier.
3270- # /// @param n_threads Number of threads as passed to llama_eval().
3271- # LLAMA_API void llama_beam_search(
3272- # struct llama_context * ctx,
3273- # llama_beam_search_callback_fn_t callback,
3274- # void * callback_data,
3275- # size_t n_beams,
3276- # int32_t n_past,
3277- # int32_t n_predict);
3278- @ctypes_function (
3279- "llama_beam_search" ,
3280- [
3281- llama_context_p_ctypes ,
3282- llama_beam_search_callback_fn_t ,
3283- ctypes .c_void_p ,
3284- ctypes .c_size_t ,
3285- ctypes .c_int32 ,
3286- ctypes .c_int32 ,
3287- ],
3288- None ,
3289- )
3290- def llama_beam_search (
3291- ctx : llama_context_p ,
3292- callback : CtypesFuncPointer ,
3293- callback_data : ctypes .c_void_p ,
3294- n_beams : Union [ctypes .c_size_t , int ],
3295- n_past : Union [ctypes .c_int , int ],
3296- n_predict : Union [ctypes .c_int , int ],
3297- / ,
3298- ): ...
3299-
3300-
33013236# /// @details Build a split GGUF final path for this chunk.
33023237# /// llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf"
33033238# // Returns the split_path length.
0 commit comments