220 | 220 | # LLAMA_VOCAB_PRE_TYPE_BLOOM = 23, |
221 | 221 | # LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24, |
222 | 222 | # LLAMA_VOCAB_PRE_TYPE_EXAONE = 25, |
| 223 | +# LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26, |
223 | 224 | # }; |
224 | 225 | LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0 |
225 | 226 | LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1 |
|
247 | 248 | LLAMA_VOCAB_PRE_TYPE_BLOOM = 23 |
248 | 249 | LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24 |
249 | 250 | LLAMA_VOCAB_PRE_TYPE_EXAONE = 25 |
| 251 | +LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26 |
250 | 252 |
|
251 | 253 |
|
252 | 254 | # // note: these values should be synchronized with ggml_rope |
|
404 | 406 | # LLAMA_POOLING_TYPE_MEAN = 1, |
405 | 407 | # LLAMA_POOLING_TYPE_CLS = 2, |
406 | 408 | # LLAMA_POOLING_TYPE_LAST = 3, |
| 409 | +# LLAMA_POOLING_TYPE_RANK = 4, // used by reranking models to attach the classification head to the graph |
407 | 410 | # }; |
408 | 411 | LLAMA_POOLING_TYPE_UNSPECIFIED = -1 |
409 | 412 | LLAMA_POOLING_TYPE_NONE = 0 |
410 | 413 | LLAMA_POOLING_TYPE_MEAN = 1 |
411 | 414 | LLAMA_POOLING_TYPE_CLS = 2 |
412 | 415 | LLAMA_POOLING_TYPE_LAST = 3 |
| 416 | +LLAMA_POOLING_TYPE_RANK = 4 |
413 | 417 |
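The new `LLAMA_POOLING_TYPE_RANK` value lets a reranking model attach its classification head to the graph, as the upstream comment notes. A minimal sketch of wiring it up through the low-level bindings is below; the `reranker.gguf` path is a placeholder and the exact model is an assumption, not something from this diff.

```python
# Sketch only: configure the low-level context params so a reranking model
# uses the new RANK pooling type. "reranker.gguf" is a placeholder path.
import llama_cpp

mparams = llama_cpp.llama_model_default_params()
model = llama_cpp.llama_load_model_from_file(b"reranker.gguf", mparams)

cparams = llama_cpp.llama_context_default_params()
cparams.embeddings = True                                 # rank scores come back via the embeddings API
cparams.pooling_type = llama_cpp.LLAMA_POOLING_TYPE_RANK  # attach the classification head to the graph
ctx = llama_cpp.llama_new_context_with_model(model, cparams)
```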
|
414 | 418 | # enum llama_attention_type { |
415 | 419 | # LLAMA_ATTENTION_TYPE_UNSPECIFIED = -1, |
|
420 | 424 | LLAMA_ATTENTION_TYPE_CAUSAL = 0 |
421 | 425 | LLAMA_ATTENTION_TYPE_NON_CAUSAL = 1 |
422 | 426 |
|
| 427 | + |
423 | 428 | # enum llama_split_mode { |
424 | | -# LLAMA_SPLIT_MODE_NONE = 0, // single GPU |
425 | | -# LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs |
426 | | -# LLAMA_SPLIT_MODE_ROW = 2, // split rows across GPUs |
| 429 | +# LLAMA_SPLIT_MODE_NONE = 0, // single GPU |
| 430 | +# LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs |
| 431 | +# LLAMA_SPLIT_MODE_ROW = 2, // split rows across GPUs |
427 | 432 | # }; |
428 | 433 | LLAMA_SPLIT_MODE_NONE = 0 |
429 | 434 | LLAMA_SPLIT_MODE_LAYER = 1 |
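For reference, a rough sketch of selecting one of these split modes through the model params; the `model.gguf` path is a placeholder and the choice of `main_gpu` is only illustrative.

```python
# Sketch only: choose how the model is split across GPUs via llama_model_params.
# LAYER splits layers (and KV cache) across devices; ROW splits tensor rows.
import llama_cpp

mparams = llama_cpp.llama_model_default_params()
mparams.split_mode = llama_cpp.LLAMA_SPLIT_MODE_LAYER  # or LLAMA_SPLIT_MODE_NONE / LLAMA_SPLIT_MODE_ROW
mparams.main_gpu = 0                                    # device that hosts the small tensors
model = llama_cpp.llama_load_model_from_file(b"model.gguf", mparams)
```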
@@ -2520,7 +2525,8 @@ def llama_get_embeddings_ith( |
2520 | 2525 |
|
2521 | 2526 | # // Get the embeddings for a sequence id |
2522 | 2527 | # // Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE |
2523 | | -# // shape: [n_embd] (1-dimensional) |
| 2528 | +# // when pooling_type == LLAMA_POOLING_TYPE_RANK, returns float[1] with the rank of the sequence |
| 2529 | +# // otherwise: float[n_embd] (1-dimensional) |
2524 | 2530 | # LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id); |
2525 | 2531 | @ctypes_function( |
2526 | 2532 | "llama_get_embeddings_seq", |
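Per the updated comment, with `LLAMA_POOLING_TYPE_RANK` the "embedding" for a sequence is a single float holding the rank score. A minimal sketch of reading it, assuming the `ctx` from the RANK-pooling sketch above and that a (query, document) pair has already been decoded as sequence 0:

```python
# Sketch only: read the float[1] rank score for sequence 0.
import llama_cpp

ptr = llama_cpp.llama_get_embeddings_seq(ctx, 0)  # POINTER(c_float); NULL if pooling_type is NONE
if ptr:
    score = ptr[0]  # with RANK pooling this is the sequence's rank / relevance score
    print(f"rerank score: {score:.4f}")
```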
@@ -2672,6 +2678,8 @@ def llama_token_eot(model: llama_model_p, /) -> int: |
2672 | 2678 | # // |
2673 | 2679 | # // Tokenization |
2674 | 2680 | # // |
| 2681 | +# // The API is thread-safe. |
| 2682 | +# // |
2675 | 2683 |
|
2676 | 2684 |
|
2677 | 2685 | # /// @details Convert the provided text into tokens. |
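The new comment states that the tokenization API is thread-safe, so tokenizing from several worker threads should be safe. A small sketch using the high-level wrapper; the model path is a placeholder and `vocab_only=True` is just a convenient way to load only the tokenizer.

```python
# Sketch only: tokenize from a thread pool, relying on the documented
# thread-safety of the tokenization API. "model.gguf" is a placeholder.
from concurrent.futures import ThreadPoolExecutor
from llama_cpp import Llama

llm = Llama(model_path="model.gguf", vocab_only=True)  # tokenizer without weights

texts = [b"hello world", b"reranking models", b"thread-safe tokenization"]
with ThreadPoolExecutor(max_workers=4) as pool:
    token_lists = list(pool.map(llm.tokenize, texts))
print(token_lists)
```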
|