feat: Update llama.cpp

abetlen · abetlen · commit 7403e002b8e0 · 2024-10-22T09:33:48.000-04:00
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -55,6 +55,9 @@ if (LLAMA_BUILD)
     set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
     set(CMAKE_SKIP_RPATH FALSE)
 
+    # Enable building of the common library
+    set(LLAMA_BUILD_COMMON ON CACHE BOOL "Build llama.cpp common library" FORCE)
+
     # Building llama
     if (APPLE AND NOT CMAKE_SYSTEM_PROCESSOR MATCHES "arm64")
         # Need to disable these llama.cpp flags on Apple x86_64,
@@ -106,7 +109,7 @@ if (LLAMA_BUILD)
         # Building llava
         add_subdirectory(vendor/llama.cpp/examples/llava)
         set_target_properties(llava_shared PROPERTIES OUTPUT_NAME "llava")
-        # Set CUDA_ARCHITECTURES to OFF on windows
+        # Set CUDA_ARCHITECTURES to OFF on Windows
         if (WIN32)
             set_target_properties(llava_shared PROPERTIES CUDA_ARCHITECTURES OFF)
         endif()
@@ -121,5 +124,18 @@ if (LLAMA_BUILD)
                 DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp/lib
             )
         endif()
+
+        # Fix for llava build: add the include directories for llama.h and ggml
+        # These commands must run after the add_subdirectory call that defines the llava targets
+        target_include_directories(llava PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/include)
+        target_include_directories(llava PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/ggml/include)
+
+        if (BUILD_SHARED_LIBS)
+            target_include_directories(llava_shared PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/include)
+            target_include_directories(llava_shared PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/ggml/include)
+        endif()
+
+        target_include_directories(llama-llava-cli PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/include)
+        target_include_directories(llama-minicpmv-cli PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/include)
     endif()
 endif()
diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
@@ -464,6 +464,8 @@ class llama_token_data(ctypes.Structure):
 
 
 # typedef struct llama_token_data_array {
+#     // TODO: consider SoA
+#     // NOTE: this pointer can be modified by the samplers
 #     llama_token_data * data;
 #     size_t size;
 #     int64_t selected; // this is the index in the data array (i.e. not the token id)
@@ -507,8 +509,11 @@ class llama_token_data_array(ctypes.Structure):
 # // - token  : the token ids of the input (used when embd is NULL)
 # // - embd   : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
 # // - pos    : the positions of the respective token in the sequence
+# //            (if set to NULL, the token position will be tracked automatically by llama_decode)
 # // - seq_id : the sequence to which the respective token belongs
+# //            (if set to NULL, the sequence ID will be assumed to be 0)
 # // - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output
+# //            (if set to NULL, only the logits for last token will be returned)
 # //
 # typedef struct llama_batch {
 #     int32_t n_tokens;
@@ -519,16 +524,6 @@ class llama_token_data_array(ctypes.Structure):
 #     int32_t      *  n_seq_id;
 #     llama_seq_id ** seq_id;
 #     int8_t       *  logits; // TODO: rename this to "output"
-
-
-#     // NOTE: helpers for smooth API transition - can be deprecated in the future
-#     //       for future-proof code, use the above fields instead and ignore everything below
-#     //
-#     // pos[i] = all_pos_0 + i*all_pos_1
-#     //
-#     llama_pos    all_pos_0;  // used if pos == NULL
-#     llama_pos    all_pos_1;  // used if pos == NULL
-#     llama_seq_id all_seq_id; // used if seq_id == NULL
 # } llama_batch;
 class llama_batch(ctypes.Structure):
     """Input data for llama_decode
@@ -563,9 +558,6 @@ class llama_batch(ctypes.Structure):
         ("n_seq_id", ctypes.POINTER(ctypes.c_int32)),
         ("seq_id", ctypes.POINTER(ctypes.POINTER(llama_seq_id))),
         ("logits", ctypes.POINTER(ctypes.c_int8)),
-        ("all_pos_0", llama_pos),
-        ("all_pos_1", llama_pos),
-        ("all_seq_id", llama_seq_id),
     ]
 
 
@@ -1170,6 +1162,12 @@ def llama_supports_gpu_offload() -> bool:
     ...
 
 
+# LLAMA_API bool llama_supports_rpc        (void);
+@ctypes_function("llama_supports_rpc", [], ctypes.c_bool)
+def llama_supports_rpc() -> bool:
+    ...
+
+
 # LLAMA_API uint32_t llama_n_ctx      (const struct llama_context * ctx);
 @ctypes_function("llama_n_ctx", [llama_context_p_ctypes], ctypes.c_uint32)
 def llama_n_ctx(ctx: llama_context_p, /) -> int:
@@ -2255,30 +2253,26 @@ def llama_state_seq_load_file(
 # //
 
 
-# // Return batch for single sequence of tokens starting at pos_0
+# // Return batch for single sequence of tokens
+# // The sequence ID will be fixed to 0
+# // The position of the tokens will be tracked automatically by llama_decode
 # //
 # // NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it
 # //
 # LLAMA_API struct llama_batch llama_batch_get_one(
 #               llama_token * tokens,
-#                   int32_t   n_tokens,
-#                 llama_pos   pos_0,
-#              llama_seq_id   seq_id);
+#                   int32_t   n_tokens);
 @ctypes_function(
     "llama_batch_get_one",
     [
         llama_token_p,
-        ctypes.c_int,
-        llama_pos,
-        llama_seq_id,
+        ctypes.c_int32,
     ],
     llama_batch,
 )
 def llama_batch_get_one(
     tokens: CtypesArray[llama_token],
     n_tokens: Union[ctypes.c_int, int],
-    pos_0: Union[llama_pos, int],
-    seq_id: llama_seq_id,
     /,
 ) -> llama_batch:
-    """Return batch for single sequence of tokens starting at pos_0
+    """Return batch for single sequence of tokens
@@ -2616,6 +2610,13 @@ def llama_token_eos(model: llama_model_p, /) -> int:
     ...
 
 
+# LLAMA_API llama_token llama_token_eot(const struct llama_model * model); // end-of-turn
+@ctypes_function("llama_token_eot", [llama_model_p_ctypes], llama_token)
+def llama_token_eot(model: llama_model_p, /) -> int:
+    """end-of-turn"""
+    ...
+
+
 # LLAMA_API llama_token llama_token_cls(const struct llama_model * model); // classification
 @ctypes_function("llama_token_cls", [llama_model_p_ctypes], llama_token)
 def llama_token_cls(model: llama_model_p, /) -> int:
@@ -2650,30 +2651,54 @@ def llama_add_eos_token(model: llama_model_p, /) -> bool:
 
 
 # // Codellama infill tokens
-# LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
+# DEPRECATED(LLAMA_API llama_token llama_token_prefix(const struct llama_model * model), "use llama_token_fim_pre instead");
 @ctypes_function("llama_token_prefix", [llama_model_p_ctypes], llama_token)
 def llama_token_prefix(model: llama_model_p) -> int:
     """codellama infill tokens"""
     ...
 
 
-# LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle
+# DEPRECATED(LLAMA_API llama_token llama_token_middle(const struct llama_model * model), "use llama_token_fim_mid instead");
 @ctypes_function("llama_token_middle", [llama_model_p_ctypes], llama_token)
 def llama_token_middle(model: llama_model_p, /) -> int:
     ...
 
 
-# LLAMA_API llama_token llama_token_suffix(const struct llama_model * model); // Beginning of infill suffix
+# DEPRECATED(LLAMA_API llama_token llama_token_suffix(const struct llama_model * model), "use llama_token_fim_suf instead");
 @ctypes_function("llama_token_suffix", [llama_model_p_ctypes], llama_token)
 def llama_token_suffix(model: llama_model_p, /) -> int:
     ...
 
 
-# LLAMA_API llama_token llama_token_eot   (const struct llama_model * model); // End of infill middle
-@ctypes_function("llama_token_eot", [llama_model_p_ctypes], llama_token)
-def llama_token_eot(model: llama_model_p, /) -> int:
+# LLAMA_API llama_token llama_token_fim_pre(const struct llama_model * model);
+@ctypes_function("llama_token_fim_pre", [llama_model_p_ctypes], llama_token)
+def llama_token_fim_pre(model: llama_model_p, /) -> int:
+    ...
+
+# LLAMA_API llama_token llama_token_fim_suf(const struct llama_model * model);
+@ctypes_function("llama_token_fim_suf", [llama_model_p_ctypes], llama_token)
+def llama_token_fim_suf(model: llama_model_p, /) -> int:
     ...
 
+# LLAMA_API llama_token llama_token_fim_mid(const struct llama_model * model);
+@ctypes_function("llama_token_fim_mid", [llama_model_p_ctypes], llama_token)
+def llama_token_fim_mid(model: llama_model_p, /) -> int:
+    ...
+
+# LLAMA_API llama_token llama_token_fim_pad(const struct llama_model * model);
+@ctypes_function("llama_token_fim_pad", [llama_model_p_ctypes], llama_token)
+def llama_token_fim_pad(model: llama_model_p, /) -> int:
+    ...
+
+# LLAMA_API llama_token llama_token_fim_rep(const struct llama_model * model);
+@ctypes_function("llama_token_fim_rep", [llama_model_p_ctypes], llama_token)
+def llama_token_fim_rep(model: llama_model_p, /) -> int:
+    ...
+
+# LLAMA_API llama_token llama_token_fim_sep(const struct llama_model * model);
+@ctypes_function("llama_token_fim_sep", [llama_model_p_ctypes], llama_token)
+def llama_token_fim_sep(model: llama_model_p, /) -> int:
+    ...
 
 # //
 # // Tokenization
@@ -2786,6 +2811,23 @@ def llama_token_to_piece(
     ...
 
 
+# # // check if token0 is contained as a prefix in token1
+# # LLAMA_API bool llama_token_is_prefix(
+# #           const struct llama_model * model,
+# #                        llama_token   token0,
+# #                        llama_token   token1);
+# @ctypes_function(
+#     "llama_token_is_prefix",
+#     [llama_model_p_ctypes, llama_token, llama_token],
+#     ctypes.c_bool,
+# )
+# def llama_token_is_prefix(
+#     model: llama_model_p, token0: Union[llama_token, int], token1: Union[llama_token, int], /
+# ) -> bool:
+#     """Check if token0 is contained as a prefix in token1"""
+#     ...
+
+
 # /// @details Convert the provided tokens into text (inverse of llama_tokenize()).
 # /// @param text The char pointer must be large enough to hold the resulting text.
 # /// @return Returns the number of chars/bytes on success, no more than text_len_max.
@@ -3099,20 +3141,22 @@ def llama_sampler_chain_remove(
 
 # // available samplers:
 #
-# LLAMA_API struct llama_sampler * llama_sampler_init_greedy     (void);
+# LLAMA_API struct llama_sampler * llama_sampler_init_greedy(void);
 @ctypes_function("llama_sampler_init_greedy", [], llama_sampler_p_ctypes)
 def llama_sampler_init_greedy() -> llama_sampler_p:
     ...
 
 
-# LLAMA_API struct llama_sampler * llama_sampler_init_dist       (uint32_t seed);
+# LLAMA_API struct llama_sampler * llama_sampler_init_dist  (uint32_t seed);
 @ctypes_function("llama_sampler_init_dist", [ctypes.c_uint32], llama_sampler_p_ctypes)
 def llama_sampler_init_dist(seed: int) -> llama_sampler_p:
     ...
 
 
 # /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
-# LLAMA_API struct llama_sampler * llama_sampler_init_softmax    (void);
+# /// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first.
+# DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_softmax    (void),
+#     "will be removed in the future (see https://github.com/ggerganov/llama.cpp/pull/9896#discussion_r1800920915)");
 @ctypes_function("llama_sampler_init_softmax", [], llama_sampler_p_ctypes)
 def llama_sampler_init_softmax() -> llama_sampler_p:
     ...
@@ -3188,6 +3232,19 @@ def llama_sampler_init_temp_ext(
     ...
 
 
+# /// @details XTC sampler as described in https://github.com/oobabooga/text-generation-webui/pull/6335
+# LLAMA_API struct llama_sampler * llama_sampler_init_xtc        (float   p, float   t,     size_t min_keep, uint32_t seed);
+@ctypes_function(
+    "llama_sampler_init_xtc",
+    [ctypes.c_float, ctypes.c_float, ctypes.c_size_t, ctypes.c_uint32],
+    llama_sampler_p_ctypes,
+)
+def llama_sampler_init_xtc(
+    p: float, t: float, min_keep: int, seed: int, /
+) -> llama_sampler_p:
+    ...
+
+
 # /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
 # /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
 # /// @param tau  The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
@@ -3301,6 +3358,39 @@ def llama_sampler_init_logit_bias(
     ...
 
 
+# // this sampler is meant to be used for fill-in-the-middle infilling
+# // it's supposed to be used after top_k + top_p sampling
+# //
+# // 1. if the sum of the EOG probs times the number of candidates is higher than the sum of the other probs -> pick EOG
+# // 2. combine probs of tokens that have the same prefix
+# //
+# // example:
+# //
+# // - before:
+# //   "hel":   0.5
+# //   "hell":  0.2
+# //   "hello": 0.1
+# //   "dummy": 0.1
+# //
+# // - after:
+# //   "hel":   0.8
+# //   "dummy": 0.1
+# //
+# // 3. discard non-EOG tokens with low prob
+# // 4. if no tokens are left -> pick EOT
+# //
+# LLAMA_API struct llama_sampler * llama_sampler_init_infill(const struct llama_model * model);
+@ctypes_function(
+    "llama_sampler_init_infill",
+    [llama_model_p_ctypes],
+    llama_sampler_p_ctypes,
+)
+def llama_sampler_init_infill(model: llama_model_p, /) -> llama_sampler_p:
+    """This sampler is meant to be used for fill-in-the-middle infilling.
+    It's supposed to be used after top_k + top_p sampling."""
+    ...
+
+
 # // Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise
 # LLAMA_API uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl);
 @ctypes_function(
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit c919d5db39c8a7fcb64737f008e4b105ee0acd20
+Subproject commit c421ac072d46172ab18924e1e8be53680b54ed3b