Skip to content

Commit 0865453

Browse files
gustavocidornelas authored and whoseoyster committed
Completes OPEN-5692 Stream trace to Openlayer
1 parent 621c51b commit 0865453

File tree

5 files changed

+111
-34
lines changed

5 files changed

+111
-34
lines changed

openlayer/llm_monitors.py

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -126,14 +126,17 @@ def __init__(
126126
self.monitor_output_only = monitor_output_only
127127
self.monitoring_on = False
128128
self.df = pd.DataFrame(columns=["input", "output", "tokens", "latency"])
129-
130-
self.data_streamer = data_streamer.DataStreamer(
131-
openlayer_api_key=openlayer_api_key,
132-
openlayer_project_name=openlayer_project_name,
133-
openlayer_inference_pipeline_name=openlayer_inference_pipeline_name,
134-
openlayer_inference_pipeline_id=openlayer_inference_pipeline_id,
135-
publish=publish,
136-
)
129+
self.publish = publish
130+
self.data_streamer = None
131+
132+
if self.publish is True:
133+
self.data_streamer = data_streamer.DataStreamer(
134+
openlayer_api_key=openlayer_api_key,
135+
openlayer_project_name=openlayer_project_name,
136+
openlayer_inference_pipeline_name=openlayer_inference_pipeline_name,
137+
openlayer_inference_pipeline_id=openlayer_inference_pipeline_id,
138+
publish=publish,
139+
)
137140

138141
def __enter__(self):
139142
self.start_monitoring()
@@ -473,7 +476,7 @@ def start_monitoring(self) -> None:
473476
self.monitoring_on = True
474477
self._overwrite_completion_methods()
475478
print("All the calls to OpenAI models are now being monitored!")
476-
if self.data_streamer.publish:
479+
if self.publish:
477480
print(
478481
"Furthermore, since `publish` was set to True, the data is being"
479482
f" published to your '{self.data_streamer.openlayer_project_name}' Openlayer project."
@@ -502,7 +505,7 @@ def stop_monitoring(self):
502505
self._restore_completion_methods()
503506
self.monitoring_on = False
504507
print("Monitoring stopped.")
505-
if not self.data_streamer.publish:
508+
if not self.publish:
506509
print(
507510
"To publish the data collected so far to your Openlayer project, "
508511
"call the `publish_batch_data` method."
@@ -520,7 +523,7 @@ def _restore_completion_methods(self) -> None:
520523
def publish_batch_data(self):
521524
"""Manually publish the accumulated data to Openlayer when automatic publishing
522525
is disabled (i.e., ``publish=False``)."""
523-
if self.data_streamer.publish:
526+
if self.publish:
524527
print(
525528
"You have set `publish` to True, so every request you've made so far"
526529
" was already published to Openlayer."

openlayer/services/data_streamer.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
Validates the arguments needed for data streaming and handles the streaming
44
process.
55
"""
6+
67
import logging
78
from typing import Dict, Optional
89

@@ -79,6 +80,14 @@ def _validate_attributes(self) -> None:
7980
"or set the OPENLAYER_INFERENCE_PIPELINE_ID or"
8081
" OPENLAYER_INFERENCE_PIPELINE_NAME environment variables."
8182
)
83+
logger.info(
84+
"Data will be streamed to Openlayer project %s and inference pipeline %s.",
85+
self.openlayer_project_name,
86+
(
87+
self.openlayer_inference_pipeline_id
88+
or self.openlayer_inference_pipeline_name
89+
),
90+
)
8291

8392
def stream_data(self, data: Dict[str, any], config: Dict[str, any]) -> None:
8493
"""Stream data to the Openlayer platform.
@@ -90,6 +99,7 @@ def stream_data(self, data: Dict[str, any], config: Dict[str, any]) -> None:
9099

91100
self._check_inference_pipeline_ready()
92101
self.inference_pipeline.stream_data(stream_data=data, stream_config=config)
102+
logger.info("Data streamed to Openlayer.")
93103

94104
def _check_inference_pipeline_ready(self) -> None:
95105
"""Lazy load the inference pipeline and check if it is ready."""
@@ -144,3 +154,4 @@ def publish_batch_data(self, df: pd.DataFrame, config: Dict[str, any]) -> None:
144154
"""
145155
self._check_inference_pipeline_ready()
146156
self.inference_pipeline.publish_batch_data(batch_df=df, batch_config=config)
157+
logger.info("Batch of data published to Openlayer.")

openlayer/tracing/steps.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,8 @@ def add_nested_step(self, nested_step: "Step") -> None:
2929
"""Adds a nested step to the current step."""
3030
self.steps.append(nested_step)
3131

32-
def update_data(self, **kwargs: Any) -> None:
33-
"""Updates the step data."""
32+
def log(self, **kwargs: Any) -> None:
33+
"""Logs step data."""
3434
for key, value in kwargs.items():
3535
if hasattr(self, key):
3636
setattr(self, key, value)

openlayer/tracing/tracer.py

Lines changed: 81 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,28 @@
11
"""Module with the logic to create and manage traces and steps."""
22

3+
import contextvars
34
import inspect
4-
from typing import Any, Dict, Optional, Generator
5+
import logging
6+
import time
57
from contextlib import contextmanager
6-
import contextvars
78
from functools import wraps
9+
from typing import Any, Dict, Generator, List, Optional, Tuple
810

9-
from . import steps
10-
from . import traces
11-
import time
11+
from ..services import data_streamer
12+
from . import steps, traces
13+
14+
logger = logging.getLogger(__name__)
15+
16+
_streamer = None
17+
try:
18+
_streamer = data_streamer.DataStreamer(publish=True)
19+
except Exception as exc:
20+
logger.error(
21+
"You have not provided enough information to upload traces to Openlayer."
22+
"\n%s \n"
23+
"To upload the traces, please provide the missing information and try again.",
24+
exc,
25+
)
1226

1327
_current_step = contextvars.ContextVar("current_step")
1428
_current_trace = contextvars.ContextVar("current_trace")
@@ -23,36 +37,85 @@ def create_step(
2337
metadata: Dict[str, any] = {},
2438
) -> Generator[steps.Step, None, None]:
2539
"""Starts a trace and yields a Step object."""
26-
new_step = steps.step_factory(
40+
new_step: steps.Step = steps.step_factory(
2741
step_type=step_type, name=name, inputs=inputs, output=output, metadata=metadata
2842
)
29-
parent_step = _current_step.get(None)
30-
is_root_step = parent_step is None
43+
44+
parent_step: Optional[steps.Step] = _current_step.get(None)
45+
is_root_step: bool = parent_step is None
3146

3247
if parent_step is None:
33-
print("Starting a new trace...")
48+
logger.debug("Starting a new trace...")
3449
current_trace = traces.Trace()
3550
_current_trace.set(current_trace) # Set the current trace in context
3651
current_trace.add_step(new_step)
3752
else:
38-
print(f"Adding step {name} to parent step {parent_step.name}")
53+
logger.debug(f"Adding step {name} to parent step {parent_step.name}")
3954
current_trace = _current_trace.get()
4055
parent_step.add_nested_step(new_step)
4156

4257
token = _current_step.set(new_step)
43-
4458
try:
4559
yield new_step
4660
finally:
4761
_current_step.reset(token)
4862
if is_root_step:
49-
print("Ending the trace...")
50-
print("-" * 80)
51-
print(current_trace.to_dict())
52-
print("-" * 80)
63+
logger.debug("Ending the trace...")
64+
trace_data, input_variable_names = process_trace_for_upload(current_trace)
65+
config = {
66+
"outputColumnName": "output",
67+
"inputVariableNames": input_variable_names,
68+
"label": "production",
69+
"groundTruthColumnName": "groundTruth",
70+
"latencyColumnName": "latency",
71+
}
72+
if isinstance(new_step, steps.OpenAIChatCompletionStep):
73+
config.update(
74+
{
75+
"costColumnName": "cost",
76+
"numOfTokenColumnName": "tokens",
77+
"prompt": new_step.inputs.get("prompt"),
78+
}
79+
)
80+
if _streamer:
81+
_streamer.stream_data(data=trace_data, config=config)
82+
else:
83+
logger.warning(
84+
"Trace computed but not uploaded to Openlayer. "
85+
"You have not provided enough information to upload traces to"
86+
" Openlayer."
87+
)
5388
else:
54-
# TODO: stream to Openlayer
55-
print(f"Ending step {name}")
89+
logger.debug(f"Ending step {name}")
90+
91+
92+
def process_trace_for_upload(trace: traces.Trace) -> Tuple[Dict[str, Any], List[str]]:
93+
"""Post processing of the trace data before uploading to Openlayer.
94+
95+
This is done to ensure backward compatibility with data on Openlayer.
96+
"""
97+
root_step = trace.steps[0]
98+
99+
input_variables = root_step.inputs
100+
input_variable_names = list(input_variables.keys())
101+
102+
trace_data = {
103+
**input_variables,
104+
"output": root_step.output,
105+
"groundTruth": root_step.ground_truth,
106+
"latency": root_step.latency,
107+
"steps": trace.to_dict(),
108+
}
109+
# Extra fields for openai_chat_completion step
110+
if isinstance(root_step, steps.OpenAIChatCompletionStep):
111+
trace_data.update(
112+
{
113+
"cost": root_step.cost,
114+
"tokens": root_step.prompt_tokens + root_step.completion_tokens,
115+
}
116+
)
117+
118+
return trace_data, input_variable_names
56119

57120

58121
def trace(*step_args, **step_kwargs):
@@ -74,7 +137,7 @@ def wrapper(*func_args, **func_kwargs):
74137
inputs.pop("self", None)
75138
inputs.pop("cls", None)
76139

77-
step.update_data(
140+
step.log(
78141
inputs=inputs,
79142
output=output,
80143
end_time=end_time,

openlayer/tracing/traces.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
"""Module with the Trace class."""
22

3-
from typing import Any, Dict
3+
from typing import Any, Dict, List
44

55
from .steps import Step
66

@@ -14,6 +14,6 @@ def add_step(self, step: Step) -> None:
1414
"""Adds a step to the trace."""
1515
self.steps.append(step)
1616

17-
def to_dict(self) -> Dict[str, Any]:
17+
def to_dict(self) -> List[Dict[str, Any]]:
1818
"""Dictionary representation of the Trace."""
19-
return {"rows": [step.to_dict() for step in self.steps]}
19+
return [step.to_dict() for step in self.steps]

0 commit comments

Comments (0)