
Commit 1b0ce23

Parthibwhoseoyster authored and committed
OPEN-5555: Added methods for sending stream data
1 parent 7b6c8ac commit 1b0ce23

File tree

3 files changed: +146 -1 lines changed


CHANGELOG.md

Lines changed: 1 addition & 0 deletions
@@ -20,6 +20,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 * Warnings if the dependencies from the `requirement_txt_file` and current environment are inconsistent.
 * Paths to custom SSL certificates can now be modified by altering `openlayer.api.VERIFY_REQUESTS`. The value can either be True (default), False, or a path to a certificate.
 * Ability to check for goal statuses through the API.
+* New method `send_stream_data` for inference pipelines, used for real-time streaming of small amounts of data.
 
 ### Changed
 
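For a concrete sense of the streaming pattern this entry describes, here is a minimal sketch (assuming an `inference_pipeline` object and a stream `config` were already set up as in the docstring further below; `serve_requests` is a hypothetical stand-in for a model-serving loop, not part of this commit):

    import pandas as pd

    # Stream each small chunk of inferences to Openlayer as it is produced.
    # `serve_requests` is a hypothetical serving loop, shown for illustration.
    for features, predictions in serve_requests():
        chunk = pd.DataFrame(features)
        chunk["prediction"] = predictions
        inference_pipeline.send_stream_data(
            stream_df=chunk,
            stream_config=config,
        )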

openlayer/__init__.py

Lines changed: 70 additions & 1 deletion
@@ -1046,7 +1046,7 @@ def upload_reference_dataset(
             storage_uri_key="referenceDatasetUri",
             method="PUT",
         )
-        print("Referece dataset uploaded!")
+        print("Reference dataset uploaded!")
 
     def upload_reference_dataframe(
         self,
@@ -1073,6 +1073,75 @@ def upload_reference_dataframe(
             dataset_config_file_path=dataset_config_file_path,
             task_type=task_type,
         )
+
+    def send_stream_data(
+        self,
+        inference_pipeline_id: str,
+        task_type: TaskType,
+        stream_df: pd.DataFrame,
+        stream_config: Optional[Dict[str, Any]] = None,
+        stream_config_file_path: Optional[str] = None,
+        verbose: bool = True,
+    ) -> None:
+        """Publishes a stream of production data to the Openlayer platform."""
+        if stream_config is None and stream_config_file_path is None:
+            raise ValueError(
+                "Either `stream_config` or `stream_config_file_path` must be provided."
+            )
+        if stream_config_file_path is not None and not os.path.exists(
+            stream_config_file_path
+        ):
+            raise exceptions.OpenlayerValidationError(
+                f"Stream config file path {stream_config_file_path} does not exist."
+            ) from None
+        elif stream_config_file_path is not None:
+            stream_config = utils.read_yaml(stream_config_file_path)
+
+        stream_config["label"] = "production"
+
+        # Validate the stream of data
+        stream_validator = dataset_validators.get_validator(
+            task_type=task_type,
+            dataset_config=stream_config,
+            dataset_config_file_path=stream_config_file_path,
+            dataset_df=stream_df,
+        )
+        failed_validations = stream_validator.validate()
+
+        if failed_validations:
+            raise exceptions.OpenlayerValidationError(
+                "There are issues with the stream of data and its config. \n"
+                "Make sure to fix all of the issues listed above before the upload.",
+            ) from None
+
+        # Load the dataset config and augment it with defaults
+        stream_data = DatasetSchema().load(
+            {"task_type": task_type.value, **stream_config}
+        )
+
+        # Add default columns if not present
+        if stream_data.get("columnNames") is None:
+            stream_data["columnNames"] = list(stream_df.columns)
+        columns_to_add = {"timestampColumnName", "inferenceIdColumnName"}
+        for column in columns_to_add:
+            if stream_data.get(column) is None:
+                stream_data, stream_df = self._add_default_column(
+                    config=stream_data, df=stream_df, column_name=column
+                )
+
+
+        body = {
+            "datasetConfig": stream_data,
+            "dataset": stream_df.to_dict(orient="records"),
+        }
+
+        self.api.post_request(
+            endpoint=f"inference-pipelines/{inference_pipeline_id}/data-stream",
+            body=body,
+        )
+
+        if verbose:
+            print("Stream published!")
 
     def publish_batch_data(
         self,
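The method above validates the stream, augments the config with defaults, and POSTs the result to the pipeline's `data-stream` endpoint. A rough sketch of the request body it builds, assuming a two-row tabular stream (all values illustrative; the shape follows the `body = {...}` construction and the `to_dict(orient="records")` call above):

    # Illustrative request body; keys mirror what send_stream_data assembles.
    body = {
        "datasetConfig": {
            "label": "production",  # set unconditionally by send_stream_data
            "columnNames": ["feature_a", "prediction"],  # from list(stream_df.columns)
            "timestampColumnName": "timestamp",  # added as a default if absent
            "inferenceIdColumnName": "inference_id",  # added as a default if absent
            # ...plus the task-specific keys from the user's stream config...
        },
        "dataset": [
            {"feature_a": 0.3, "prediction": 1},
            {"feature_a": 0.7, "prediction": 0},
        ],
    }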

openlayer/inference_pipelines.py

Lines changed: 75 additions & 0 deletions
@@ -238,6 +238,81 @@ def upload_reference_dataframe(
             task_type=self.taskType,
             **kwargs,
         )
+
+    def send_stream_data(self, *args, **kwargs):
+        """Publishes a stream of production data to the Openlayer platform.
+
+        Parameters
+        ----------
+        stream_df : pd.DataFrame
+            Dataframe containing the stream of production data.
+        stream_config : Dict[str, Any], optional
+            Dictionary containing the stream configuration. This is not needed if
+            ``stream_config_file_path`` is provided.
+
+            .. admonition:: What's in the config?
+
+                The configuration for a stream of data depends on the :obj:`TaskType`.
+                Refer to the `How to write dataset configs guides <https://docs.openlayer.com/docs/tabular-classification-dataset-config>`_
+                for details. These configurations are the same for development data
+                and streams of production data.
+
+        stream_config_file_path : str
+            Path to the configuration YAML file. This is not needed if
+            ``stream_config`` is provided.
+
+            .. admonition:: What's in the config file?
+
+                The configuration for a stream of data depends on the :obj:`TaskType`.
+                Refer to the `How to write dataset configs guides <https://docs.openlayer.com/docs/tabular-classification-dataset-config>`_
+                for details. These configurations are the same for development data
+                and streams of production data.
+
+        Notes
+        -----
+        Production data usually has a column with the inference timestamps. This
+        column is specified in the ``timestampColumnName`` of the stream config,
+        and it should contain timestamps in the **UNIX format in seconds**.
+
+        Production data also usually has a column with the prediction IDs. This
+        column is specified in the ``inferenceIdColumnName`` of the stream config.
+        It is particularly important when the ground truths are not available at
+        inference time and are updated later.
+
+        If the above are not provided, **Openlayer will generate inference IDs and
+        use the current time as the inference timestamp**.
+
+        Examples
+        --------
+        **Related guide**: `How to set up monitoring <https://docs.openlayer.com/docs/set-up-monitoring>`_.
+
+        First, instantiate the client and retrieve an existing inference pipeline:
+
+        >>> import openlayer
+        >>>
+        >>> client = openlayer.OpenlayerClient('YOUR_API_KEY_HERE')
+        >>>
+        >>> project = client.load_project(name="Churn prediction")
+        >>>
+        >>> inference_pipeline = project.load_inference_pipeline(
+        ...     name="XGBoost model inference pipeline",
+        ... )
+
+        With the ``InferencePipeline`` object retrieved, you can publish a stream
+        of production data -- in this example, stored in a pandas dataframe
+        called ``df`` -- with:
+
+        >>> inference_pipeline.send_stream_data(
+        ...     stream_df=df,
+        ...     stream_config=config,
+        ... )
+        """
+        return self.client.send_stream_data(
+            *args,
+            inference_pipeline_id=self.id,
+            task_type=self.taskType,
+            **kwargs,
+        )
 
     def publish_batch_data(self, *args, **kwargs):
         """Publishes a batch of production data to the Openlayer platform.
