Skip to content

Commit 29b1268

Browse files
gustavocidornelas authored and whoseoyster committed
Add default columnNames for batches of data, tar dataset and config for batches of data, fix create inference pipeline bug, and fix monitoring notebook config
1 parent f48ed41 commit 29b1268

File tree

2 files changed

+30
-9
lines changed

2 files changed

+30
-9
lines changed

examples/monitoring/quickstart/monitoring-quickstart.ipynb

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -52,8 +52,8 @@
5252
"metadata": {},
5353
"outputs": [],
5454
"source": [
55-
"import openlayer\n",
5655
"from openlayer.tasks import TaskType\n",
56+
"import openlayer\n",
5757
"\n",
5858
"client = openlayer.OpenlayerClient(\"YOUR_API_KEY_HERE\")\n",
5959
"project = client.create_or_load_project(\n",
@@ -83,6 +83,16 @@
8383
"# inference_pipeline = project.load_inference_pipeline(name=\"Production\")"
8484
]
8585
},
86+
{
87+
"cell_type": "code",
88+
"execution_count": null,
89+
"id": "61e916c2",
90+
"metadata": {},
91+
"outputs": [],
92+
"source": [
93+
"inference_pipeline"
94+
]
95+
},
8696
{
8797
"cell_type": "markdown",
8898
"id": "39592b32",
@@ -227,8 +237,8 @@
227237
"outputs": [],
228238
"source": [
229239
"batch_1 = production_data.loc[:342]\n",
230-
"batch_2 = production_data.loc[342:684]\n",
231-
"batch_3 = production_data.loc[684:]"
240+
"batch_2 = production_data.loc[343:684]\n",
241+
"batch_3 = production_data.loc[686:]"
232242
]
233243
},
234244
{
@@ -276,7 +286,8 @@
276286
" \"Year\"\n",
277287
" ],\n",
278288
" \"timestampColumnName\": \"timestamp\",\n",
279-
" \"inferenceIdColumnName\": \"inference_id\"\n",
289+
" \"inferenceIdColumnName\": \"inference_id\",\n",
290+
" \"predictionsColumnName\": \"predictions\"\n",
280291
"}\n"
281292
]
282293
},

openlayer/__init__.py

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,6 @@ def create_project(
132132
warnings.warn(
133133
f"Found an existing project with name '{name}'. Loading it instead."
134134
)
135-
return project
136135
except exceptions.OpenlayerResourceNotFound:
137136
# Validate project
138137
project_config = {
@@ -168,7 +167,7 @@ def create_project(
168167
print(
169168
f"Created your project. Navigate to {project.links['app']} to see it."
170169
)
171-
return project
170+
return project
172171

173172
def load_project(self, name: str) -> Project:
174173
"""Loads an existing project from the Openlayer platform.
@@ -956,7 +955,7 @@ def create_inference_pipeline(
956955
"Created your inference pipeline. Navigate to"
957956
f" {inference_pipeline.links['app']} to see it."
958957
)
959-
return inference_pipeline
958+
return inference_pipeline
960959

961960
def load_inference_pipeline(
962961
self,
@@ -1019,6 +1018,9 @@ def upload_reference_dataset(
10191018
dataset_data = DatasetSchema().load(
10201019
{"task_type": task_type.value, **dataset_config}
10211020
)
1021+
# Add default columns if not present
1022+
if dataset_data.get("columnNames") is None:
1023+
dataset_data["columnNames"] = utils.get_column_names(file_path)
10221024

10231025
with tempfile.TemporaryDirectory() as tmp_dir:
10241026
# Copy relevant files to tmp dir
@@ -1110,6 +1112,8 @@ def publish_batch_data(
11101112
)
11111113

11121114
# Add default columns if not present
1115+
if batch_data.get("columnNames") is None:
1116+
batch_data["columnNames"] = list(batch_df.columns)
11131117
columns_to_add = {"timestampColumnName", "inferenceIdColumnName"}
11141118
for column in columns_to_add:
11151119
if batch_data.get(column) is None:
@@ -1123,6 +1127,12 @@ def publish_batch_data(
11231127
with tempfile.TemporaryDirectory() as tmp_dir:
11241128
# Copy save files to tmp dir
11251129
batch_df.to_csv(f"{tmp_dir}/dataset.csv", index=False)
1130+
utils.write_yaml(batch_data, f"{tmp_dir}/dataset_config.yaml")
1131+
1132+
tar_file_path = os.path.join(tmp_dir, "tarfile")
1133+
with tarfile.open(tar_file_path, mode="w:gz") as tar:
1134+
tar.add(tmp_dir, arcname=os.path.basename("batch_data"))
1135+
11261136
payload = {
11271137
"earliestTimestamp": int(earliest_timestamp),
11281138
"latestTimestamp": int(latest_timestamp),
@@ -1132,8 +1142,8 @@ def publish_batch_data(
11321142

11331143
self.api.upload(
11341144
endpoint=f"inference-pipelines/{inference_pipeline_id}/data",
1135-
file_path=f"{tmp_dir}/dataset.csv",
1136-
object_name="dataset.csv",
1145+
file_path=tar_file_path,
1146+
object_name="tarfile",
11371147
body=payload,
11381148
storage_uri_key="storageUri",
11391149
method="POST",

0 commit comments

Comments (0)