Completes OPEN-5689 Support streaming for OpenAI Monitor

gustavocidornelas · whoseoyster · commit 23bb675df034 · 2024-03-06T13:02:38.000-08:00
diff --git a/examples/monitoring/quickstart/llms/openai_llm_monitor.ipynb b/examples/monitoring/quickstart/llms/openai_llm_monitor.ipynb
@@ -96,6 +96,14 @@
     "That's it! Now you can continue using OpenAI LLMs normally. The data is automatically published to Openlayer and you can start creating tests around it!"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "397097b4-aea9-4064-8621-4e0d2077da6d",
+   "metadata": {},
+   "source": [
+    "#### If you call the `create` method with `stream=False` (default):"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -114,6 +122,46 @@
     ")"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "dff26b5d-4e86-4863-9f86-5dc98fe51140",
+   "metadata": {},
+   "source": [
+    "#### If you call the `create` method with `stream=True`:\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "aee9d5c7-496b-48ca-8095-7e79c0753712",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "chunks = openai_client.chat.completions.create(\n",
+    "    model=\"gpt-3.5-turbo\",\n",
+    "    messages=[\n",
+    "        {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n",
+    "        {\"role\": \"user\", \"content\": \"How are you doing today?\"},\n",
+    "        {\"role\": \"assistant\", \"content\": \"Pretty well! How about you?\"},\n",
+    "        {\"role\": \"user\", \"content\": \"I am doing well, but would like some words of encouragement.\"},\n",
+    "    ],\n",
+    "    stream=True   \n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "20d15545-dab2-4763-83f0-6dafb2834886",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Collect the messages from the stream\n",
+    "collected_messages = []\n",
+    "for chunk in chunks:\n",
+    "    collected_messages.append(chunk.choices[0].delta.content)   "
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -149,7 +197,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.13"
+   "version": "3.9.18"
   }
  },
  "nbformat": 4,
diff --git a/openlayer/llm_monitors.py b/openlayer/llm_monitors.py
@@ -177,44 +177,118 @@ def _get_modified_create_chat_completion(self) -> callable:
         """Returns a modified version of the create method for openai.ChatCompletion."""
 
         def modified_create_chat_completion(*args, **kwargs) -> str:
-            start_time = time.time()
-            response = self.create_chat_completion(*args, **kwargs)
-            latency = (time.time() - start_time) * 1000
+            stream = kwargs.get("stream", False)
 
-            try:
-                # Extract data
-                prompt, input_data = self.format_input(kwargs["messages"])
-                output_data = response.choices[0].message.content.strip()
-                num_of_tokens = response.usage.total_tokens
-                cost = self.get_cost_estimate(
-                    model=kwargs.get("model"),
-                    num_input_tokens=response.usage.prompt_tokens,
-                    num_output_tokens=response.usage.completion_tokens,
-                )
+            if not stream:
+                start_time = time.time()
+                response = self.create_chat_completion(*args, **kwargs)
+                latency = (time.time() - start_time) * 1000
 
-                # Prepare config
-                config = self.data_config.copy()
-                config["prompt"] = prompt
-                if not self.monitor_output_only:
-                    config.update({"inputVariableNames": list(input_data.keys())})
-
-                self._append_row_to_df(
-                    input_data=input_data,
-                    output_data=output_data,
-                    num_of_tokens=num_of_tokens,
-                    latency=latency,
-                    cost=cost,
-                )
+                try:
+                    # Extract data
+                    prompt, input_data = self.format_input(kwargs["messages"])
+                    output_data = response.choices[0].message.content.strip()
+                    num_of_tokens = response.usage.total_tokens
+                    cost = self.get_cost_estimate(
+                        model=kwargs.get("model"),
+                        num_input_tokens=response.usage.prompt_tokens,
+                        num_output_tokens=response.usage.completion_tokens,
+                    )
 
-                self.data_streamer.stream_data(
-                    data=self.df.tail(1).to_dict(orient="records"),
-                    config=config,
-                )
-            # pylint: disable=broad-except
-            except Exception as e:
-                logger.error("Failed to monitor chat request. %s", e)
+                    # Prepare config
+                    config = self.data_config.copy()
+                    config["prompt"] = prompt
+                    if not self.monitor_output_only:
+                        config.update({"inputVariableNames": list(input_data.keys())})
 
-            return response
+                    self._append_row_to_df(
+                        input_data=input_data,
+                        output_data=output_data,
+                        num_of_tokens=num_of_tokens,
+                        latency=latency,
+                        cost=cost,
+                    )
+
+                    self.data_streamer.stream_data(
+                        data=self.df.tail(1).to_dict(orient="records"),
+                        config=config,
+                    )
+                # pylint: disable=broad-except
+                except Exception as e:
+                    logger.error("Failed to monitor chat request. %s", e)
+
+                return response
+            else:
+                chunks = self.create_chat_completion(*args, **kwargs)
+
+                def stream_chunks():
+                    """Yield chunks unchanged, then publish one row for the stream."""
+                    # Stream errors propagate to the consumer; `finally` still publishes.
+                    # NOTE: timing starts at first iteration, not when `chunks` was created.
+                    collected_messages = []
+                    start_time = time.time()
+                    first_token_time = None
+                    num_of_completion_tokens = None
+                    try:
+                        for i, chunk in enumerate(chunks):
+                            if i == 0:
+                                first_token_time = time.time()
+                            try:
+                                content = chunk.choices[0].delta.content
+                                collected_messages.append(content)
+                            # pylint: disable=broad-except
+                            except Exception as e:
+                                logger.error("Failed to monitor chat request. %s", e)
+                            # Chunk tally = token estimate; survives early exit.
+                            num_of_completion_tokens = i + 1
+                            yield chunk
+                    finally:
+                        try:
+                            latency = (time.time() - start_time) * 1000
+                            # Extract data
+                            prompt, input_data = self.format_input(kwargs["messages"])
+                            # Skip chunks whose delta content is `None`.
+                            output_data = "".join(
+                                m for m in collected_messages if m is not None
+                            )
+                            completion_cost = self.get_cost_estimate(
+                                model=kwargs.get("model"),
+                                num_input_tokens=0,
+                                num_output_tokens=num_of_completion_tokens or 0,
+                            )
+
+                            # Prepare config
+                            config = self.data_config.copy()
+                            config["prompt"] = prompt
+                            if not self.monitor_output_only:
+                                config.update(
+                                    {"inputVariableNames": list(input_data.keys())}
+                                )
+
+                            self._append_row_to_df(
+                                input_data=input_data,
+                                output_data=output_data,
+                                num_of_tokens=num_of_completion_tokens,
+                                latency=latency,
+                                cost=completion_cost,
+                                time_to_first_token=(
+                                    (first_token_time - start_time) * 1000
+                                    if first_token_time is not None
+                                    else None
+                                ),
+                                completion_tokens=num_of_completion_tokens,
+                                completion_cost=completion_cost,
+                            )
+
+                            self.data_streamer.stream_data(
+                                data=self.df.tail(1).to_dict(orient="records"),
+                                config=config,
+                            )
+                        # pylint: disable=broad-except
+                        except Exception as e:
+                            logger.error("Failed to monitor chat request. %s", e)
+
+                return stream_chunks()
 
         return modified_create_chat_completion
 
@@ -348,9 +422,10 @@ def _append_row_to_df(
         self,
         input_data: Dict[str, str],
         output_data: str,
-        num_of_tokens: int,
         latency: float,
+        num_of_tokens: int,
         cost: float,
+        **kwargs,
     ) -> None:
         """Appends a row with input/output, number of tokens, and latency to the
         df."""
@@ -367,6 +442,7 @@ def _append_row_to_df(
                         "latency": latency,
                         "cost": cost,
                     },
+                    **kwargs,
                 }
             ]
         )