Commit 8c1b771

port kv_cache to new memory
1 parent 91503a2 commit 8c1b771

File tree: 4 files changed, +23 -7 lines

examples/notebooks/Batching.ipynb
Lines changed: 1 addition & 1 deletion

@@ -230,7 +230,7 @@
 "outputs": [],
 "source": [
 "for i in range(n_parallel):\n",
-"    llama_cpp.llama_kv_cache_seq_cp(ctx, 0, i, 0, batch.n_tokens)"
+"    llama_cpp.llama_kv_self_seq_cp(ctx, 0, i, 0, batch.n_tokens)"
 ]
 },
 {
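For reference, upstream llama.cpp renamed the llama_kv_cache_* sequence helpers to llama_kv_self_*, which is what this cell picks up. A minimal sketch of the parallel-sequence setup the cell belongs to, assuming ctx, batch, and n_parallel are already created as earlier in the notebook:

# Sketch only: ctx, batch, and n_parallel are assumed to exist as in the notebook.
# After the shared prompt is decoded into sequence 0, its KV cache cells
# [0, batch.n_tokens) are copied into each parallel sequence so generation can fork.
import llama_cpp

llama_cpp.llama_decode(ctx, batch)  # evaluate the shared prompt as sequence 0
for i in range(1, n_parallel):
    llama_cpp.llama_kv_self_seq_cp(ctx, 0, i, 0, batch.n_tokens)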

llama_cpp/_ctypes_extensions.py
Lines changed: 17 additions & 2 deletions

@@ -5,6 +5,8 @@
 import ctypes
 import functools
 import pathlib
+import logging
+import traceback
 
 from typing import (
     Any,
@@ -18,6 +20,9 @@
 )
 from typing_extensions import TypeAlias
 
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger("llama_cpp.binding")
 
 # Load the library
 def load_shared_library(lib_base_name: str, base_path: pathlib.Path):
@@ -110,11 +115,21 @@ def ctypes_function(
     ):
         def decorator(f: F) -> F:
             if enabled:
+                print(f"Setting up binding for C function: {name}")  # Print when binding is created
                 func = getattr(lib, name)
                 func.argtypes = argtypes
                 func.restype = restype
-                functools.wraps(f)(func)
-                return func
+
+                @functools.wraps(f)
+                def wrapper(*args, **kwargs):
+                    print(f">>> Calling {name} with args: {args}")  # Print right before C call
+                    sys.stdout.flush()  # Force flush to ensure we see the output
+                    result = func(*args, **kwargs)
+                    print(f"<<< {name} returned successfully")  # Print after successful return
+                    sys.stdout.flush()
+                    return result
+
+                return wrapper
             else:
                 return f
 
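Note that the new wrapper calls sys.stdout.flush() while this hunk only adds logging and traceback at the top of the module, so an import sys is also needed unless sys is already imported elsewhere in the file (the configured logger itself is not used by the wrapper). A self-contained sketch of the bind-and-trace pattern, with hypothetical names and assuming lib is an already-loaded ctypes.CDLL handle:

import ctypes
import functools
import sys


def bind_traced(lib: ctypes.CDLL, name: str, argtypes, restype):
    # Look up the C symbol and attach its ctypes signature.
    func = getattr(lib, name)
    func.argtypes = argtypes
    func.restype = restype

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        print(f">>> Calling {name} with args: {args}")
        sys.stdout.flush()  # flush so the line survives a crash inside the C call
        result = func(*args, **kwargs)
        print(f"<<< {name} returned successfully")
        sys.stdout.flush()
        return result

    return wrapper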

llama_cpp/llama.py
Lines changed: 2 additions & 2 deletions

@@ -578,7 +578,7 @@ def eval(self, tokens: Sequence[int]):
         Args:
             tokens: The list of tokens to evaluate.
         """
-        self._ctx.kv_cache_seq_rm(-1, self.n_tokens, -1)
+        self._ctx.kv_self_seq_rm(-1, self.n_tokens, -1)
         for i in range(0, len(tokens), self.n_batch):
             batch = tokens[i : min(len(tokens), i + self.n_batch)]
             n_past = self.n_tokens
@@ -886,7 +886,7 @@ def generate(
 
                 if sample_idx < self.n_tokens and token != self._input_ids[sample_idx]:
                     self.n_tokens = sample_idx
-                    self._ctx.kv_cache_seq_rm(-1, self.n_tokens, -1)
+                    self._ctx.kv_self_seq_rm(-1, self.n_tokens, -1)
                     break
 
                 if self.draft_model is not None:
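Both call sites roll the KV cache back before tokens are re-evaluated. In the underlying llama.cpp API a negative sequence id matches every sequence and a negative end position means "to the end of the cache", so the call discards all cached tokens at positions >= self.n_tokens. A low-level sketch of the same rollback, with ctx and n_keep as illustrative assumptions:

import llama_cpp

def rollback_kv_cache(ctx, n_keep: int) -> None:
    # Drop cached tokens at positions [n_keep, end) across all sequences (seq_id=-1).
    llama_cpp.llama_kv_self_seq_rm(ctx, -1, n_keep, -1)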

llama_cpp/llama_cpp.py
Lines changed: 3 additions & 2 deletions

@@ -798,7 +798,8 @@ class llama_model_params(ctypes.Structure):
 # //   ref: https://github.com/ggml-org/llama.cpp/pull/14363
 # };
 class llama_context_params(ctypes.Structure):
-    """Parameters for llama_context
+    """Parameters for llama_context. NOTE: changing the default values of parameters marked as [EXPERIMENTAL]
+    may cause crashes or incorrect results in certain configurations.
 
     Attributes:
         n_ctx (int): text context, 0 = from model
@@ -808,7 +809,7 @@ class llama_context_params(ctypes.Structure):
         n_threads (int): number of threads to use for generation
         n_threads_batch (int): number of threads to use for batch processing
         rope_scaling_type (int): RoPE scaling type, from `enum llama_rope_scaling_type`
-        pooling_type (int): whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)
+        pooling_type (int): whether to pool (sum) embedding results by sequence id
         attention_type (int): attention type to use for embeddings
         rope_freq_base (float): RoPE base frequency, 0 = from model
         rope_freq_scale (float): RoPE frequency scaling factor, 0 = from model
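The docstring documents fields on the ctypes struct used when creating a context. A short sketch of starting from the library defaults and overriding a couple of the documented fields, assuming the low-level helper llama_context_default_params exposed by these bindings:

import llama_cpp

# Start from the library defaults, then override documented fields as needed.
params = llama_cpp.llama_context_default_params()
params.n_ctx = 4096           # text context, 0 = from model
params.n_threads = 8          # threads used for generation
params.n_threads_batch = 8    # threads used for batch processing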
