@@ -50,20 +50,99 @@ def supported_models(cls) -> list[str]:
   async def generate_content_async(
       self, llm_request: LlmRequest, stream: bool = False
   ) -> AsyncGenerator[LlmResponse, None]:
-    """Generates one content from the given contents and tools.
+    """Generates content for a single model turn.
+
+    This method handles Server-Sent Events (SSE) streaming for unidirectional
+    content generation. For bidirectional streaming (e.g., Gemini Live API),
+    use the `connect()` method instead.
 
     Args:
       llm_request: LlmRequest, the request to send to the LLM.
-      stream: bool = False, whether to do streaming call.
+      stream: bool = False, whether to enable SSE streaming mode.
 
     Yields:
-      a generator of types.Content.
+      LlmResponse objects representing the model's response for one turn.
+
+      **Non-streaming mode (stream=False):**
+
+      Yields exactly one LlmResponse containing the complete model output
+      (text, function calls, bytes, etc.). This response has `partial=False`.
+
+      **Streaming mode (stream=True):**
+
+      Yields multiple LlmResponse objects as chunks arrive:
+
+      - Intermediate chunks: `partial=True` (progressive updates)
+      - Final chunk: `partial=False` (aggregated content from the entire
+        turn, identical to the stream=False output)
+      - Text consolidation: Consecutive text parts of the same kind
+        (thought vs. non-thought) SHOULD merge without a separator, but
+        client code must not rely on this; unconsolidated parts are
+        unusual but still valid
+
+      **Common content in partial chunks:**
+
+      All intermediate chunks have `partial=True` regardless of content type.
+      Common examples include:
+
+      - Text: Streams incrementally as tokens arrive
+      - Function calls: May arrive in separate chunks
+      - Bytes (e.g., images): Typically arrive as a single chunk,
+        interleaved with text
+      - Thoughts: Stream incrementally when thinking_config is enabled
+
+      **Examples:**
+
+      1. Simple text streaming::
+
+          LlmResponse(partial=True, parts=["The weather"])
+          LlmResponse(partial=True, parts=[" in Tokyo is"])
+          LlmResponse(partial=True, parts=[" sunny."])
+          LlmResponse(partial=False, parts=["The weather in Tokyo is sunny."])
+
+      2. Text + function call::
+
+          LlmResponse(partial=True, parts=[Text("Let me check...")])
+          LlmResponse(partial=True, parts=[FunctionCall("get_weather", ...)])
+          LlmResponse(partial=False, parts=[Text("Let me check..."),
+                                            FunctionCall("get_weather", ...)])
+
+      3. Parallel function calls across chunks::
+
+          LlmResponse(partial=True, parts=[Text("Checking both cities...")])
+          LlmResponse(partial=True, parts=[FunctionCall("get_weather", Tokyo)])
+          LlmResponse(partial=True, parts=[FunctionCall("get_weather", NYC)])
+          LlmResponse(partial=False, parts=[Text("Checking both cities..."),
+                                            FunctionCall("get_weather", Tokyo),
+                                            FunctionCall("get_weather", NYC)])
+
+      4. Text + bytes (image generation with gemini-2.5-flash-image)::
+
+          LlmResponse(partial=True, parts=[Text("Here's an image of a dog.")])
+          LlmResponse(partial=True, parts=[Text("\n ")])
+          LlmResponse(partial=True, parts=[Blob(image/png, 1.6MB)])
+          LlmResponse(partial=True, parts=[Text("It carries a bone")])
+          LlmResponse(partial=True, parts=[Text(" and runs around.")])
+          LlmResponse(partial=False, parts=[Text("Here's an image of a dog.\n "),
+                                            Blob(image/png, 1.6MB),
+                                            Text("It carries a bone and runs around.")])
+
+      Note: Consecutive text parts before and after the blob merge separately.
+
+      5. Text with thinking (gemini-2.5-flash with thinking_config)::
+
+          LlmResponse(partial=True, parts=[Thought("Let me analyze...")])
+          LlmResponse(partial=True, parts=[Thought("The user wants...")])
+          LlmResponse(partial=True, parts=[Text("Based on my analysis,")])
+          LlmResponse(partial=True, parts=[Text(" the answer is 42.")])
+          LlmResponse(partial=False, parts=[Thought("Let me analyze...The user wants..."),
+                                            Text("Based on my analysis, the answer is 42.")])
 
-      For non-streaming call, it will only yield one Content.
+      Note: Consecutive parts of the same type merge (thoughts→thought, text→text).
 
-      For streaming call, it may yield more than one content, but all yielded
-      contents should be treated as one content by merging the
-      parts list.
+      **Important:** All yielded responses represent one logical model turn.
+      The final response with `partial=False` should be identical to the
+      response that would be received with `stream=False`.
     """
     raise NotImplementedError(
         f'Async generation is not supported for {self.model}.'
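
For reference, a minimal consumer sketch of the contract documented above. It assumes a concrete `BaseLlm` subclass instance (`llm` below) and an already-built `LlmRequest`; the import path and the helper name `run_turn` are illustrative, not part of this change.

```python
from typing import Optional

# Assumed import path for the ADK request/response types; adjust if the
# package layout differs.
from google.adk.models import LlmRequest, LlmResponse


async def run_turn(llm, llm_request: LlmRequest) -> Optional[LlmResponse]:
  """Consumes one streamed model turn, keeping only the final response."""
  final: Optional[LlmResponse] = None
  async for response in llm.generate_content_async(llm_request, stream=True):
    if response.partial:
      # Intermediate chunk: safe to render progressively, but transient;
      # its content is folded into the final response below.
      pass
    else:
      # Final chunk (partial=False): aggregated content for the whole turn,
      # identical to what a stream=False call would have yielded.
      final = response
  return final
```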
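The consolidation rule from examples 1 and 4 (consecutive text parts of the same kind merge without a separator, while a non-text part such as inline bytes breaks the run) can be sketched as a small fold. The `(kind, value)` tuples below are stand-ins for real `Part` objects, not the actual genai types.

```python
from typing import List, Tuple


def consolidate(parts: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
  """Merges consecutive same-kind text runs; other parts break the run."""
  merged: List[Tuple[str, str]] = []
  for kind, value in parts:
    if merged and kind in ('text', 'thought') and merged[-1][0] == kind:
      # Same-kind run: concatenate with no separator, per the docstring.
      merged[-1] = (kind, merged[-1][1] + value)
    else:
      merged.append((kind, value))
  return merged


# Example 1 above: three streamed text chunks collapse into a single part.
assert consolidate(
    [('text', 'The weather'), ('text', ' in Tokyo is'), ('text', ' sunny.')]
) == [('text', 'The weather in Tokyo is sunny.')]
```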