Commit c3debdf

feat: add usage to streaming response
1 parent f31ac2e commit c3debdf

File tree

3 files changed: +140 -82 lines changed

llama_cpp/llama.py

Lines changed: 110 additions & 77 deletions
@@ -1054,6 +1054,50 @@ def decode_batch(seq_sizes: List[int]):
         else:
             return output
 
+    def _create_chunk(
+        self,
+        completion_id: str,
+        created: int,
+        model_name: str,
+        text: str,
+        logprobs_or_none: Union[Optional[CompletionLogprobs], None],
+        index: int,
+        finish_reason: Union[str, None],
+        usage: Optional[Dict[str, Any]] = None,
+    ) -> CreateCompletionStreamResponse:
+        """Create chunks for streaming API, depending on whether usage is requested or not."""
+        if usage is not None:
+            return {
+                "id": completion_id,
+                "object": "text_completion",
+                "created": created,
+                "model": model_name,
+                "choices": [
+                    {
+                        "text": text,
+                        "index": index,
+                        "logprobs": logprobs_or_none,
+                        "finish_reason": finish_reason,
+                    }
+                ],
+                "usage": usage,
+            }
+        else:
+            return {
+                "id": completion_id,
+                "object": "text_completion",
+                "created": created,
+                "model": model_name,
+                "choices": [
+                    {
+                        "text": text,
+                        "index": index,
+                        "logprobs": logprobs_or_none,
+                        "finish_reason": finish_reason,
+                    }
+                ],
+            }
+
     def _create_completion(
         self,
         prompt: Union[str, List[int]],
@@ -1380,24 +1424,20 @@ def logit_bias_processor(
                         "top_logprobs": [top_logprob],
                     }
                 returned_tokens += 1
-                yield {
-                    "id": completion_id,
-                    "object": "text_completion",
-                    "created": created,
-                    "model": model_name,
-                    "choices": [
-                        {
-                            "text": self.detokenize(
-                                [token],
-                                prev_tokens=prompt_tokens
-                                + completion_tokens[:returned_tokens],
-                            ).decode("utf-8", errors="ignore"),
-                            "index": 0,
-                            "logprobs": logprobs_or_none,
-                            "finish_reason": None,
-                        }
-                    ],
-                }
+                yield self._create_chunk(
+                    completion_id=completion_id,
+                    created=created,
+                    model_name=model_name,
+                    text=self.detokenize(
+                        [token],
+                        prev_tokens=prompt_tokens
+                        + completion_tokens[:returned_tokens],
+                    ).decode("utf-8", errors="ignore"),
+                    logprobs_or_none=logprobs_or_none,
+                    index=0,
+                    finish_reason=None,
+                    usage=None,
+                )
             else:
                 while len(remaining_tokens) > 0:
                     decode_success = False
@@ -1426,20 +1466,16 @@ def logit_bias_processor(
                     remaining_tokens = remaining_tokens[i:]
                     returned_tokens += i
 
-                    yield {
-                        "id": completion_id,
-                        "object": "text_completion",
-                        "created": created,
-                        "model": model_name,
-                        "choices": [
-                            {
-                                "text": ts,
-                                "index": 0,
-                                "logprobs": None,
-                                "finish_reason": None,
-                            }
-                        ],
-                    }
+                    yield self._create_chunk(
+                        completion_id=completion_id,
+                        created=created,
+                        model_name=model_name,
+                        text=ts,
+                        logprobs_or_none=None,
+                        index=0,
+                        finish_reason=None,
+                        usage=None,
+                    )
 
             if len(completion_tokens) >= max_tokens:
                 text = self.detokenize(completion_tokens, prev_tokens=prompt_tokens)
@@ -1518,54 +1554,51 @@ def logit_bias_processor(
                     if token_end_position == end - 1:
                         break
                     returned_tokens += 1
-                    yield {
-                        "id": completion_id,
-                        "object": "text_completion",
-                        "created": created,
-                        "model": model_name,
-                        "choices": [
-                            {
-                                "text": last_text[
-                                    : len(last_text) - (token_end_position - end)
-                                ].decode("utf-8", errors="ignore"),
-                                "index": 0,
-                                "logprobs": logprobs_or_none,
-                                "finish_reason": None,
-                            }
-                        ],
-                    }
+                    yield self._create_chunk(
+                        completion_id=completion_id,
+                        created=created,
+                        model_name=model_name,
+                        text=last_text[
+                            : len(last_text) - (token_end_position - end)
+                        ].decode("utf-8", errors="ignore"),
+                        logprobs_or_none=logprobs_or_none,
+                        index=0,
+                        finish_reason=None,
+                        usage=None,
+                    )
                     break
                 returned_tokens += 1
-                yield {
-                    "id": completion_id,
-                    "object": "text_completion",
-                    "created": created,
-                    "model": model_name,
-                    "choices": [
-                        {
-                            "text": self.detokenize([token]).decode(
-                                "utf-8", errors="ignore"
-                            ),
-                            "index": 0,
-                            "logprobs": logprobs_or_none,
-                            "finish_reason": None,
-                        }
-                    ],
-                }
-            yield {
-                "id": completion_id,
-                "object": "text_completion",
-                "created": created,
-                "model": model_name,
-                "choices": [
-                    {
-                        "text": "",
-                        "index": 0,
-                        "logprobs": None,
-                        "finish_reason": finish_reason,
-                    }
-                ],
+                yield self._create_chunk(
+                    completion_id=completion_id,
+                    created=created,
+                    model_name=model_name,
+                    text=self.detokenize([token]).decode(
+                        "utf-8", errors="ignore"
+                    ),
+                    logprobs_or_none=logprobs_or_none,
+                    index=0,
+                    finish_reason=None,
+                    usage=None,
+                )
+
+            # Final streaming chunk with both finish_reason and usage
+            usage = {
+                "prompt_tokens": len(prompt_tokens),
+                "completion_tokens": returned_tokens,
+                "total_tokens": len(prompt_tokens) + returned_tokens,
             }
+
+            yield self._create_chunk(
+                completion_id=completion_id,
+                created=created,
+                model_name=model_name,
+                text="",
+                logprobs_or_none=None,
+                index=0,
+                finish_reason=finish_reason,
+                usage=usage,
+            )
+
         if self.cache:
             if self.verbose:
                 print("Llama._create_completion: cache save", file=sys.stderr)

llama_cpp/llama_chat_format.py

Lines changed: 29 additions & 4 deletions
@@ -350,6 +350,7 @@ def _convert_text_completion_chunks_to_chat(
                 "finish_reason": chunk["choices"][0]["finish_reason"],
             }
         ],
+        "usage": chunk.get("usage") if "usage" in chunk else None,
     }
 
 
@@ -434,7 +435,7 @@ def _stream_response_to_function_stream(
                 created = chunk["created"]
                 model = chunk["model"]
                 tool_id = "call_" + "_0_" + tool_name + "_" + chunk["id"]
-                yield {
+                response = {
                     "id": id_,
                     "object": "chat.completion.chunk",
                     "created": created,
@@ -453,7 +454,11 @@ def _stream_response_to_function_stream(
                         }
                     ],
                 }
-                yield {
+                if "usage" in chunk:
+                    response["usage"] = chunk["usage"]
+                yield response
+
+                response = {
                     "id": "chat" + chunk["id"],
                     "object": "chat.completion.chunk",
                     "created": chunk["created"],
@@ -487,10 +492,14 @@ def _stream_response_to_function_stream(
                        }
                    ],
                }
+                if "usage" in chunk:
+                    response["usage"] = chunk["usage"]
+                yield response
                first = False
                continue
+
            assert tool_id is not None
-            yield {
+            response = {
                "id": "chat" + chunk["id"],
                "object": "chat.completion.chunk",
                "created": chunk["created"],
@@ -522,9 +531,12 @@ def _stream_response_to_function_stream(
                    }
                ],
            }
+            if "usage" in chunk:
+                response["usage"] = chunk["usage"]
+            yield response
 
        if id_ is not None and created is not None and model is not None:
-            yield {
+            response = {
                "id": id_,
                "object": "chat.completion.chunk",
                "created": created,
@@ -543,6 +555,9 @@ def _stream_response_to_function_stream(
                    }
                ],
            }
+            if "usage" in chunk:
+                response["usage"] = chunk["usage"]
+            yield response
 
        return _stream_response_to_function_stream(chunks)
 
@@ -2123,6 +2138,7 @@ def generate_streaming(tools, functions, function_call, prompt):
                        },
                    }
                ],
+                usage=chunk["usage"] if "usage" in chunk else None,
            )
            first = False
            if tools is not None:
@@ -2163,6 +2179,7 @@ def generate_streaming(tools, functions, function_call, prompt):
                        },
                    }
                ],
+                usage=chunk["usage"] if "usage" in chunk else None,
            )
            # Yield tool_call/function_call stop message
            yield llama_types.CreateChatCompletionStreamResponse(
@@ -2185,6 +2202,7 @@ def generate_streaming(tools, functions, function_call, prompt):
                        },
                    }
                ],
+                usage=chunk["usage"] if "usage" in chunk else None,
            )
        # If "auto" or no tool_choice/function_call
        elif isinstance(function_call, str) and function_call == "auto":
@@ -2220,6 +2238,7 @@ def generate_streaming(tools, functions, function_call, prompt):
                        "finish_reason": None,
                    }
                ],
+                usage=chunk["usage"] if "usage" in chunk else None,
            )
        else:
            prompt += f"{function_name}\n<|content|>"
@@ -2265,6 +2284,7 @@ def generate_streaming(tools, functions, function_call, prompt):
                        },
                    }
                ],
+                usage=chunk["usage"] if "usage" in chunk else None,
            )
            # Generate content
            stops = [RECIPIENT_TOKEN, STOP_TOKEN]
@@ -2302,6 +2322,7 @@ def generate_streaming(tools, functions, function_call, prompt):
                        },
                    }
                ],
+                usage=chunk["usage"] if "usage" in chunk else None,
            )
            is_end = False
        elif chunk["choices"][0]["text"] == "\n":
@@ -2331,6 +2352,7 @@ def generate_streaming(tools, functions, function_call, prompt):
                        },
                    }
                ],
+                usage=chunk["usage"] if "usage" in chunk else None,
            )
            # Check whether the model wants to generate another turn
            if (
@@ -2363,6 +2385,7 @@ def generate_streaming(tools, functions, function_call, prompt):
                        "finish_reason": "stop",
                    }
                ],
+                usage=chunk["usage"] if "usage" in chunk else None,
            )
            break
        else:
@@ -2412,6 +2435,7 @@ def generate_streaming(tools, functions, function_call, prompt):
                        },
                    }
                ],
+                usage=chunk["usage"] if "usage" in chunk else None,
            )
            prompt += completion_text.strip()
            grammar = None
@@ -2451,6 +2475,7 @@ def generate_streaming(tools, functions, function_call, prompt):
                        },
                    }
                ],
+                usage=chunk["usage"] if "usage" in chunk else None,
            )
            break
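
The chat-format converters above all follow the same pattern: build the chunk dict, attach "usage" only when the underlying completion chunk carries it, then yield. An illustrative standalone sketch of that pattern (the function name is invented for this example and is not the library's API; field names mirror the diff):

from typing import Any, Dict

def completion_chunk_to_chat_chunk(chunk: Dict[str, Any]) -> Dict[str, Any]:
    """Re-wrap a streamed text-completion chunk as a chat.completion.chunk dict."""
    chat_chunk: Dict[str, Any] = {
        "id": "chat" + chunk["id"],
        "object": "chat.completion.chunk",
        "created": chunk["created"],
        "model": chunk["model"],
        "choices": [
            {
                "index": 0,
                "delta": {"content": chunk["choices"][0]["text"]},
                "finish_reason": chunk["choices"][0]["finish_reason"],
            }
        ],
    }
    # Usage rides only on the final completion chunk, so attach it conditionally,
    # mirroring the conditional assignments added in this commit.
    if "usage" in chunk:
        chat_chunk["usage"] = chunk["usage"]
    return chat_chunk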

llama_cpp/llama_types.py

Lines changed: 1 addition & 1 deletion
@@ -154,13 +154,13 @@ class ChatCompletionStreamResponseChoice(TypedDict):
     finish_reason: Optional[Literal["stop", "length", "tool_calls", "function_call"]]
     logprobs: NotRequired[Optional[ChatCompletionLogprobs]]
 
-
 class CreateChatCompletionStreamResponse(TypedDict):
     id: str
     model: str
     object: Literal["chat.completion.chunk"]
     created: int
     choices: List[ChatCompletionStreamResponseChoice]
+    usage: NotRequired[CompletionUsage]
 
 
 class ChatCompletionFunctions(TypedDict):
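
Because the new usage field is declared NotRequired, chat chunks with and without it satisfy the same TypedDict. A type-level sketch (an assumption: simplified field sets that mirror llama_cpp/llama_types.py, with delta reduced to a plain dict):

from typing import List, Literal, Optional
from typing_extensions import NotRequired, TypedDict

class CompletionUsage(TypedDict):
    prompt_tokens: int
    completion_tokens: int
    total_tokens: int

class ChatCompletionStreamResponseChoice(TypedDict):
    index: int
    delta: dict
    finish_reason: Optional[Literal["stop", "length", "tool_calls", "function_call"]]

class CreateChatCompletionStreamResponse(TypedDict):
    id: str
    model: str
    object: Literal["chat.completion.chunk"]
    created: int
    choices: List[ChatCompletionStreamResponseChoice]
    usage: NotRequired[CompletionUsage]  # present only on the final streamed chunk

# Both shapes type-check: a mid-stream chunk without usage...
mid_chunk: CreateChatCompletionStreamResponse = {
    "id": "chatcmpl-1", "model": "m", "object": "chat.completion.chunk",
    "created": 0, "choices": [],
}
# ...and a final chunk that carries the token counts.
final_chunk: CreateChatCompletionStreamResponse = {
    "id": "chatcmpl-1", "model": "m", "object": "chat.completion.chunk",
    "created": 0, "choices": [],
    "usage": {"prompt_tokens": 7, "completion_tokens": 5, "total_tokens": 12},
}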
