
Commit 2419dc2

Implement LlamaTrieCache in llama_cache.py: optimize LlamaCache lookup from O(N) to O(K) using a Trie
1 parent d42cd32 commit 2419dc2
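
For context on the complexity claim: the existing RAM cache finds the best entry by comparing the query against every cached key, so a lookup costs roughly O(N*K). The snippet below is a simplified stand-in for that baseline (not the library's exact code); the LlamaTrieCache added in this commit replaces it with a walk down a trie of token IDs that touches at most K nodes, no matter how many entries are cached.

```python
from typing import Iterable, Optional, Sequence, Tuple


def linear_longest_prefix(
    cached_keys: Iterable[Tuple[int, ...]], key: Sequence[int]
) -> Optional[Tuple[int, ...]]:
    """Shape of the O(N*K) baseline: compare the query against every cached key."""
    best_key, best_len = None, 0
    for k in cached_keys:              # N cached keys ...
        prefix_len = 0
        for a, b in zip(k, key):       # ... each compared over up to K tokens
            if a != b:
                break
            prefix_len += 1
        if prefix_len > best_len:
            best_key, best_len = k, prefix_len
    return best_key
```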

File tree

4 files changed, +166 -3 lines changed


llama_cpp/llama.py

Lines changed: 3 additions & 2 deletions
```diff
@@ -34,9 +34,10 @@
 from .llama_grammar import LlamaGrammar
 from .llama_cache import (
     BaseLlamaCache,
-    LlamaCache,  # type: ignore
+    LlamaCache,  # type: ignore
     LlamaDiskCache,  # type: ignore
-    LlamaRAMCache,  # type: ignore
+    LlamaRAMCache,  # type: ignore
+    LlamaTrieCache,  # type: ignore
 )
 from .llama_tokenizer import BaseLlamaTokenizer, LlamaTokenizer
 import llama_cpp.llama_cpp as llama_cpp
```
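
With the import in place, the new cache attaches to a Llama instance the same way the existing caches do. A minimal usage sketch, assuming the package-level re-export that the server code below relies on (llama_cpp.LlamaTrieCache), the existing Llama.set_cache API, and a placeholder model path:

```python
from llama_cpp import Llama, LlamaTrieCache  # re-export of the new class assumed

# Placeholder path; any local GGUF model works here.
llm = Llama(model_path="./models/model.gguf")

# Reuse evaluated prompt states across requests that share a token prefix,
# with a ~2 GiB budget enforced by LRU eviction.
llm.set_cache(LlamaTrieCache(capacity_bytes=2 << 30))
```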

llama_cpp/llama_cache.py

Lines changed: 158 additions & 0 deletions
```diff
@@ -97,6 +97,164 @@ def __setitem__(self, key: Sequence[int], value: "llama_cpp.llama.LlamaState"):
             self.cache_state.popitem(last=False)
 
 
+class TrieNode:
+    """A node in the prefix tree (Trie)."""
+    def __init__(self):
+        # Child nodes: {token_id: TrieNode}
+        self.children: Dict[int, "TrieNode"] = {}
+        # Stores the LlamaState if this node marks the end of a cached sequence.
+        self.state: Optional["llama_cpp.llama.LlamaState"] = None
+
+
+class LlamaTrieCache(BaseLlamaCache):
+    """
+    A Llama cache implementation using a Trie for O(K) prefix lookup
+    and an OrderedDict for O(1) LRU eviction.
+
+    - K = length of the query key (number of tokens)
+    - N = total number of items in the cache
+
+    This solves the O(N*K) lookup bottleneck of the linear scan cache.
+    """
+
+    def __init__(self, capacity_bytes: int = (2 << 30)):
+        super().__init__(capacity_bytes)
+        self.root = TrieNode()  # The root node of the Trie
+        self._current_size = 0  # O(1) tracking of cache size in bytes
+
+        # LRU Tracker:
+        #   Key: Cached token sequence (Tuple[int, ...])
+        #   Value: The *terminal* TrieNode for that key
+        self.lru_tracker: OrderedDict[
+            Tuple[int, ...], TrieNode
+        ] = OrderedDict()
+
+    @property
+    def cache_size(self) -> int:
+        """Returns the current total size of the cache in bytes (O(1))."""
+        return self._current_size
+
+    def _find_longest_prefix_node(
+        self, key: Tuple[int, ...]
+    ) -> Tuple[Optional[TrieNode], Optional[Tuple[int, ...]]]:
+        """
+        Finds the longest cached prefix for a given key in O(K) time.
+
+        Returns: (The matching TrieNode, The matching key)
+        """
+        node = self.root
+        longest_prefix_node: Optional[TrieNode] = None
+        longest_prefix_key: Optional[Tuple[int, ...]] = None
+        current_prefix: List[int] = []
+
+        # Check if the empty prefix (root) is cached
+        if node.state is not None:
+            longest_prefix_node = node
+            longest_prefix_key = tuple(current_prefix)
+
+        for token in key:
+            if token not in node.children:
+                # Path ends, no further prefix matches
+                break
+
+            node = node.children[token]
+            current_prefix.append(token)
+
+            if node.state is not None:
+                # Found a valid, longer prefix; update our best match
+                longest_prefix_node = node
+                longest_prefix_key = tuple(current_prefix)
+
+        return longest_prefix_node, longest_prefix_key
+
+    def __getitem__(self, key: Sequence[int]) -> "llama_cpp.llama.LlamaState":
+        """
+        Retrieves the state for the longest matching prefix in O(K) time.
+        Updates the LRU status.
+        """
+        key_tuple = tuple(key)
+        node, prefix_key = self._find_longest_prefix_node(key_tuple)
+
+        if node is None or node.state is None or prefix_key is None:
+            raise KeyError(f"Key prefix not found in cache for: {key_tuple}")
+
+        # Move the accessed key to the end (most recently used) in O(1)
+        self.lru_tracker.move_to_end(prefix_key)
+
+        return node.state
+
+    def __contains__(self, key: Sequence[int]) -> bool:
+        """Checks if any prefix of the key is cached in O(K) time."""
+        node, _ = self._find_longest_prefix_node(tuple(key))
+        return node is not None
+
+    def _prune(self, key: Tuple[int, ...]):
+        """
+        (Helper) Removes a key and its state from the Trie.
+        Also removes empty parent nodes (branch pruning).
+        """
+        path: List[Tuple[TrieNode, int]] = []  # Stores (parent_node, token)
+        node = self.root
+
+        # 1. Find the node and record the path
+        for token in key:
+            if token not in node.children:
+                return  # Key not found
+            path.append((node, token))
+            node = node.children[token]
+
+        # 2. Remove the state
+        if node.state is None:
+            return  # Node has no state
+
+        self._current_size -= node.state.llama_state_size
+        node.state = None
+
+        # 3. Prune empty parent nodes backward
+        for parent, token in reversed(path):
+            child = parent.children[token]
+
+            # If the child node is now empty (no children, no state), delete it
+            if not child.children and child.state is None:
+                del parent.children[token]
+            else:
+                # Node is still in use, stop pruning
+                break
+
+    def __setitem__(self, key: Sequence[int], value: "llama_cpp.llama.LlamaState"):
+        """
+        Adds a (key, state) pair to the cache in O(K) time.
+        Handles LRU updates and eviction.
+        """
+        key_tuple = tuple(key)
+
+        # 1. Find or create nodes for the key (O(K))
+        node = self.root
+        for token in key_tuple:
+            node = node.children.setdefault(token, TrieNode())
+
+        # 2. Check if updating an existing item
+        if node.state is not None:
+            self._current_size -= node.state.llama_state_size
+
+        # 3. Set new state and update O(1) size
+        node.state = value
+        self._current_size += value.llama_state_size
+
+        # 4. Update LRU tracker (O(1))
+        if key_tuple in self.lru_tracker:
+            self.lru_tracker.move_to_end(key_tuple)
+        else:
+            self.lru_tracker[key_tuple] = node
+
+        # 5. Eviction logic
+        while self._current_size > self.capacity_bytes and self.lru_tracker:
+            # Get the least recently used item in O(1)
+            evicted_key, _ = self.lru_tracker.popitem(last=False)
+
+            # Remove the evicted item from the Trie
+            self._prune(evicted_key)
+
+
 # Alias for backwards compatibility
 LlamaCache = LlamaRAMCache
```
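
A small behavioral sketch of the class above: a lookup returns the state stored for the longest cached prefix of the query, and an insert that pushes the byte total past capacity_bytes evicts the least recently used entry and prunes its now-empty trie branch. The stand-in state object is hypothetical; the cache only reads its llama_state_size attribute.

```python
from types import SimpleNamespace

from llama_cpp.llama_cache import LlamaTrieCache


def fake_state(size: int):
    # Hypothetical stand-in for llama_cpp.llama.LlamaState; only llama_state_size is read.
    return SimpleNamespace(llama_state_size=size)


cache = LlamaTrieCache(capacity_bytes=1000)

cache[(1, 2, 3)] = fake_state(600)            # cache a state for the prefix (1, 2, 3)
query = (1, 2, 3, 4, 5)                       # a longer prompt sharing that prefix
assert query in cache                         # O(K) longest-prefix check
assert cache[query].llama_state_size == 600   # state cached for (1, 2, 3) is returned

cache[(9, 9)] = fake_state(600)               # 1200 bytes now exceeds the 1000-byte capacity
assert (1, 2, 3) not in cache                 # LRU entry evicted, its trie branch pruned
assert (9, 9) in cache
```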

llama_cpp/server/model.py

Lines changed: 4 additions & 0 deletions
```diff
@@ -323,6 +323,10 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama:
             if settings.verbose:
                 print(f"Using disk cache with size {settings.cache_size}")
             cache = llama_cpp.LlamaDiskCache(capacity_bytes=settings.cache_size)
+        elif settings.cache_type == "trie":
+            if settings.verbose:
+                print(f"Using trie cache with size {settings.cache_size}")
+            cache = llama_cpp.LlamaTrieCache(capacity_bytes=settings.cache_size)
         else:
             if settings.verbose:
                 print(f"Using ram cache with size {settings.cache_size}")
```

llama_cpp/server/settings.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -159,7 +159,7 @@ class ModelSettings(BaseSettings):
         default=False,
         description="Use a cache to reduce processing times for evaluated prompts.",
     )
-    cache_type: Literal["ram", "disk"] = Field(
+    cache_type: Literal["ram", "trie", "disk"] = Field(
         default="ram",
         description="The type of cache to use. Only used if cache is True.",
     )
```
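
With the new Literal value in place, the trie cache can be selected through the server's model settings. A minimal sketch, assuming the fields shown in this commit plus ModelSettings' existing model-path field:

```python
from llama_cpp.server.model import load_llama_from_model_settings
from llama_cpp.server.settings import ModelSettings

settings = ModelSettings(
    model="./models/model.gguf",  # placeholder path to a local GGUF model
    cache=True,                   # enable prompt-state caching
    cache_type="trie",            # selects the LlamaTrieCache branch added above
    cache_size=2 << 30,           # cache budget in bytes
)
llm = load_llama_from_model_settings(settings)
```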

0 commit comments
