Commit 8c1b771

port kv_cache to new memory
1 parent 91503a2 commit 8c1b771

File tree: 4 files changed, +23 -7 lines

examples/notebooks/Batching.ipynb
Lines changed: 1 addition & 1 deletion

@@ -230,7 +230,7 @@
 "outputs": [],
 "source": [
 "for i in range(n_parallel):\n",
-"    llama_cpp.llama_kv_cache_seq_cp(ctx, 0, i, 0, batch.n_tokens)"
+"    llama_cpp.llama_kv_self_seq_cp(ctx, 0, i, 0, batch.n_tokens)"
 ]
 },
 {
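For reference, upstream llama.cpp renamed the llama_kv_cache_* sequence helpers to llama_kv_self_*, which is what this cell picks up. A minimal sketch of the parallel-sequence setup the cell belongs to, assuming ctx, batch, and n_parallel are already created as earlier in the notebook:

# Sketch only: ctx, batch, and n_parallel are assumed to exist as in the notebook.
# After the shared prompt is decoded into sequence 0, its KV cache cells
# [0, batch.n_tokens) are copied into each parallel sequence so generation can fork.
import llama_cpp

llama_cpp.llama_decode(ctx, batch)  # evaluate the shared prompt as sequence 0
for i in range(1, n_parallel):
    llama_cpp.llama_kv_self_seq_cp(ctx, 0, i, 0, batch.n_tokens)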

llama_cpp/_ctypes_extensions.py
Lines changed: 17 additions & 2 deletions

@@ -5,6 +5,8 @@
 import ctypes
 import functools
 import pathlib
+import logging
+import traceback
 
 from typing import (
     Any,
@@ -18,6 +20,9 @@
 )
 from typing_extensions import TypeAlias
 
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger("llama_cpp.binding")
 
 # Load the library
 def load_shared_library(lib_base_name: str, base_path: pathlib.Path):
@@ -110,11 +115,21 @@ def ctypes_function(
     ):
         def decorator(f: F) -> F:
             if enabled:
+                print(f"Setting up binding for C function: {name}")  # Print when binding is created
                 func = getattr(lib, name)
                 func.argtypes = argtypes
                 func.restype = restype
-                functools.wraps(f)(func)
-                return func
+
+                @functools.wraps(f)
+                def wrapper(*args, **kwargs):
+                    print(f">>> Calling {name} with args: {args}")  # Print right before C call
+                    sys.stdout.flush()  # Force flush to ensure we see the output
+                    result = func(*args, **kwargs)
+                    print(f"<<< {name} returned successfully")  # Print after successful return
+                    sys.stdout.flush()
+                    return result
+
+                return wrapper
             else:
                 return f
 
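Note that the new wrapper calls sys.stdout.flush() while this hunk only adds logging and traceback at the top of the module, so an import sys is also needed unless sys is already imported elsewhere in the file (the configured logger itself is not used by the wrapper). A self-contained sketch of the bind-and-trace pattern, with hypothetical names and assuming lib is an already-loaded ctypes.CDLL handle:

import ctypes
import functools
import sys


def bind_traced(lib: ctypes.CDLL, name: str, argtypes, restype):
    # Look up the C symbol and attach its ctypes signature.
    func = getattr(lib, name)
    func.argtypes = argtypes
    func.restype = restype

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        print(f">>> Calling {name} with args: {args}")
        sys.stdout.flush()  # flush so the line survives a crash inside the C call
        result = func(*args, **kwargs)
        print(f"<<< {name} returned successfully")
        sys.stdout.flush()
        return result

    return wrapper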

llama_cpp/llama.py
Lines changed: 2 additions & 2 deletions

@@ -578,7 +578,7 @@ def eval(self, tokens: Sequence[int]):
         Args:
             tokens: The list of tokens to evaluate.
         """
-        self._ctx.kv_cache_seq_rm(-1, self.n_tokens, -1)
+        self._ctx.kv_self_seq_rm(-1, self.n_tokens, -1)
         for i in range(0, len(tokens), self.n_batch):
             batch = tokens[i : min(len(tokens), i + self.n_batch)]
             n_past = self.n_tokens
@@ -886,7 +886,7 @@ def generate(
 
                 if sample_idx < self.n_tokens and token != self._input_ids[sample_idx]:
                     self.n_tokens = sample_idx
-                    self._ctx.kv_cache_seq_rm(-1, self.n_tokens, -1)
+                    self._ctx.kv_self_seq_rm(-1, self.n_tokens, -1)
                     break
 
                 if self.draft_model is not None:
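Both call sites roll the KV cache back before tokens are re-evaluated. In the underlying llama.cpp API a negative sequence id matches every sequence and a negative end position means "to the end of the cache", so the call discards all cached tokens at positions >= self.n_tokens. A low-level sketch of the same rollback, with ctx and n_keep as illustrative assumptions:

import llama_cpp

def rollback_kv_cache(ctx, n_keep: int) -> None:
    # Drop cached tokens at positions [n_keep, end) across all sequences (seq_id=-1).
    llama_cpp.llama_kv_self_seq_rm(ctx, -1, n_keep, -1)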

llama_cpp/llama_cpp.py
Lines changed: 3 additions & 2 deletions

@@ -798,7 +798,8 @@ class llama_model_params(ctypes.Structure):
 # //   ref: https://github.com/ggml-org/llama.cpp/pull/14363
 # };
 class llama_context_params(ctypes.Structure):
-    """Parameters for llama_context
+    """Parameters for llama_context. NOTE: changing the default values of parameters marked as [EXPERIMENTAL]
+    may cause crashes or incorrect results in certain configurations.
 
     Attributes:
         n_ctx (int): text context, 0 = from model
@@ -808,7 +809,7 @@ class llama_context_params(ctypes.Structure):
         n_threads (int): number of threads to use for generation
         n_threads_batch (int): number of threads to use for batch processing
         rope_scaling_type (int): RoPE scaling type, from `enum llama_rope_scaling_type`
-        pooling_type (int): whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)
+        pooling_type (int): whether to pool (sum) embedding results by sequence id
         attention_type (int): attention type to use for embeddings
         rope_freq_base (float): RoPE base frequency, 0 = from model
         rope_freq_scale (float): RoPE frequency scaling factor, 0 = from model
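The docstring documents fields on the ctypes struct used when creating a context. A short sketch of starting from the library defaults and overriding a couple of the documented fields, assuming the low-level helper llama_context_default_params exposed by these bindings:

import llama_cpp

# Start from the library defaults, then override documented fields as needed.
params = llama_cpp.llama_context_default_params()
params.n_ctx = 4096           # text context, 0 = from model
params.n_threads = 8          # threads used for generation
params.n_threads_batch = 8    # threads used for batch processing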
