
Commit eff6bf0

feat: upload a reference dataset
1 parent c48341a commit eff6bf0

File tree

15 files changed: 346 additions and 29 deletions

New file: reference dataset upload example

Lines changed: 55 additions & 0 deletions
@@ -0,0 +1,55 @@
+import os
+
+import pandas as pd
+from openlayer import Openlayer
+from openlayer.lib import data
+from openlayer.types.inference_pipelines import data_stream_params
+
+os.environ["OPENLAYER_API_KEY"] = "YOUR_API_KEY"
+pipeline_id = "YOUR_INFERENCE_PIPELINE_ID"
+
+df = pd.DataFrame(
+    {
+        "CreditScore": [600],
+        "Geography": ["France"],
+        "Gender": ["Male"],
+        "Age": [40],
+        "Tenure": [5],
+        "Balance": [100000],
+        "NumOfProducts": [1],
+        "HasCrCard": [1],
+        "IsActiveMember": [1],
+        "EstimatedSalary": [50000],
+        "AggregateRate": [0.5],
+        "Year": [2020],
+        "Exited": [0],
+    }
+)
+
+config = data_stream_params.ConfigTabularClassificationData(
+    categorical_feature_names=["Gender", "Geography"],
+    class_names=["Retained", "Exited"],
+    feature_names=[
+        "CreditScore",
+        "Geography",
+        "Gender",
+        "Age",
+        "Tenure",
+        "Balance",
+        "NumOfProducts",
+        "HasCrCard",
+        "IsActiveMember",
+        "EstimatedSalary",
+        "AggregateRate",
+        "Year",
+    ],
+    label_column_name="Exited",
+)
+
+data.upload_reference_dataframe(
+    client=Openlayer(),
+    inference_pipeline_id=pipeline_id,
+    dataset_df=df,
+    config=config,
+    storage_type=data.StorageType.FS,
+)
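The new example builds the reference dataset inline from a single-row DataFrame. For comparison, a minimal sketch of the same upload starting from a CSV file; the churn_reference.csv path and the OPENLAYER_INFERENCE_PIPELINE_ID environment variable are assumptions for illustration, not part of this commit:

import os

import pandas as pd
from openlayer import Openlayer
from openlayer.lib import data
from openlayer.types.inference_pipelines import data_stream_params

# Hypothetical CSV with the same columns as the inline DataFrame above.
df = pd.read_csv("churn_reference.csv")

config = data_stream_params.ConfigTabularClassificationData(
    categorical_feature_names=["Gender", "Geography"],
    class_names=["Retained", "Exited"],
    feature_names=[col for col in df.columns if col != "Exited"],
    label_column_name="Exited",
)

data.upload_reference_dataframe(
    client=Openlayer(),  # reads OPENLAYER_API_KEY from the environment by default
    inference_pipeline_id=os.environ["OPENLAYER_INFERENCE_PIPELINE_ID"],  # assumed env var
    dataset_df=df,
    config=config,
    storage_type=data.StorageType.FS,
)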

examples/rest-api/development_test_results.py

Lines changed: 1 addition & 1 deletion
@@ -9,6 +9,6 @@
     # This is the default and can be omitted
     api_key=os.environ.get("OPENLAYER_API_KEY"),
 )
-response = client.commits.test_results.list(id=commit_id)
+response = client.commits.test_results.list(commit_id=commit_id)
 
 print(response.items)
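With the keyword rename applied, the full example reads roughly as follows; the commit_id placeholder value is an assumption, since the top of the file is not shown in the diff:

import os

from openlayer import Openlayer

commit_id = "YOUR_COMMIT_ID"  # assumed placeholder; defined earlier in the real file

client = Openlayer(
    # This is the default and can be omitted
    api_key=os.environ.get("OPENLAYER_API_KEY"),
)
response = client.commits.test_results.list(commit_id=commit_id)

print(response.items)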

examples/rest-api/monitoring_test_results.py

Lines changed: 1 addition & 1 deletion
@@ -9,6 +9,6 @@
     # This is the default and can be omitted
     api_key=os.environ.get("OPENLAYER_API_KEY"),
 )
-response = client.inference_pipelines.test_results.list(id=inference_pipeline_id)
+response = client.inference_pipelines.test_results.list(inference_pipeline_id=inference_pipeline_id)
 
 print(response.items)
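The monitoring example gets the analogous rename; a minimal sketch of the resulting call, with the placeholder value assumed:

import os

from openlayer import Openlayer

inference_pipeline_id = "YOUR_INFERENCE_PIPELINE_ID"  # assumed placeholder

client = Openlayer(api_key=os.environ.get("OPENLAYER_API_KEY"))
response = client.inference_pipelines.test_results.list(
    inference_pipeline_id=inference_pipeline_id
)
print(response.items)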

examples/rest-api/stream_data.py

Lines changed: 5 additions & 5 deletions
@@ -2,6 +2,10 @@
 
 from openlayer import Openlayer
 
+# Prepare the config for the data, which depends on your project's task type. In this
+# case, we have an LLM project:
+from openlayer.types.inference_pipelines import data_stream_params
+
 # Let's say we want to stream the following row, which represents a model prediction:
 data = {"user_query": "what's the meaning of life?", "output": "42", "tokens": 7, "cost": 0.02, "timestamp": 1620000000}
 
@@ -10,10 +14,6 @@
     api_key=os.environ.get("OPENLAYER_API_KEY"),
 )
 
-# Prepare the config for the data, which depends on your project's task type. In this
-# case, we have an LLM project:
-from openlayer.types.inference_pipelines import data_stream_params
-
 config = data_stream_params.ConfigLlmData(
     input_variable_names=["user_query"],
     output_column_name="output",
@@ -25,7 +25,7 @@
 
 
 data_stream_response = client.inference_pipelines.data.stream(
-    id="YOUR_INFERENCE_PIPELINE_ID",
+    inference_pipeline_id="YOUR_INFERENCE_PIPELINE_ID",
     rows=[data],
     config=config,
 )
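Taken together, these changes move the config import to the top of the file and spell out the pipeline keyword. A sketch of the resulting flow; config fields beyond the two visible in the diff are omitted:

import os

from openlayer import Openlayer

# Prepare the config for the data, which depends on your project's task type. In this
# case, we have an LLM project:
from openlayer.types.inference_pipelines import data_stream_params

# Let's say we want to stream the following row, which represents a model prediction:
data = {"user_query": "what's the meaning of life?", "output": "42", "tokens": 7, "cost": 0.02, "timestamp": 1620000000}

client = Openlayer(
    # This is the default and can be omitted
    api_key=os.environ.get("OPENLAYER_API_KEY"),
)

config = data_stream_params.ConfigLlmData(
    input_variable_names=["user_query"],
    output_column_name="output",
    # Remaining config fields are not visible in the diff and are left out of this sketch.
)

data_stream_response = client.inference_pipelines.data.stream(
    inference_pipeline_id="YOUR_INFERENCE_PIPELINE_ID",
    rows=[data],
    config=config,
)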

examples/tracing/anthropic/anthropic_tracing.ipynb

Lines changed: 1 addition & 2 deletions
@@ -95,8 +95,7 @@
     "response = anthropic_client.messages.create(\n",
     "    model=\"claude-3-opus-20240229\",\n",
     "    max_tokens=1024,\n",
-    "    messages=[\n",
-    "        {\"role\": \"user\", \"content\": \"How are you doing today?\"}],\n",
+    "    messages=[{\"role\": \"user\", \"content\": \"How are you doing today?\"}],\n",
     ")"
    ]
   },

examples/tracing/azure-openai/azure_openai_tracing.ipynb

Lines changed: 1 addition & 1 deletion
@@ -106,7 +106,7 @@
     "    model=os.environ.get(\"AZURE_OPENAI_DEPLOYMENT_NAME\"),\n",
     "    messages=[\n",
     "        {\"role\": \"user\", \"content\": \"How are you doing today?\"},\n",
-    "    ]\n",
+    "    ],\n",
     ")"
    ]
   },

examples/tracing/openai-assistant/openai_assistant_tracing.ipynb

Lines changed: 3 additions & 6 deletions
@@ -102,8 +102,8 @@
     "thread = openai_client.beta.threads.create(\n",
     "    messages=[\n",
     "        {\n",
-    "          \"role\": \"user\",\n",
-    "          \"content\": \"Create a data visualization of the american GDP.\",\n",
+    "            \"role\": \"user\",\n",
+    "            \"content\": \"Create a data visualization of the american GDP.\",\n",
     "        }\n",
     "    ]\n",
     ")"
@@ -117,10 +117,7 @@
    "outputs": [],
    "source": [
     "# Run assistant on thread\n",
-    "run = openai_client.beta.threads.runs.create(\n",
-    "    thread_id=thread.id,\n",
-    "    assistant_id=assistant.id\n",
-    ")"
+    "run = openai_client.beta.threads.runs.create(thread_id=thread.id, assistant_id=assistant.id)"
    ]
   },
   {

examples/tracing/openai/openai_tracing.ipynb

Lines changed: 1 addition & 4 deletions
@@ -93,10 +93,7 @@
    "outputs": [],
    "source": [
     "completion = openai_client.chat.completions.create(\n",
-    "    model=\"gpt-3.5-turbo\",\n",
-    "    messages=[\n",
-    "        {\"role\": \"user\", \"content\": \"How are you doing today?\"}\n",
-    "    ]\n",
+    "    model=\"gpt-3.5-turbo\", messages=[{\"role\": \"user\", \"content\": \"How are you doing today?\"}]\n",
     ")"
    ]
   },

examples/tracing/rag/rag_tracing.ipynb

Lines changed: 10 additions & 7 deletions
@@ -78,12 +78,12 @@
    "source": [
     "class RagPipeline:\n",
     "    def __init__(self, context_path: str):\n",
-    "        # Wrap OpenAI client with Openlayer's `trace_openai` to trace it \n",
+    "        # Wrap OpenAI client with Openlayer's `trace_openai` to trace it\n",
     "        self.openai_client = trace_openai(OpenAI())\n",
-    "        \n",
+    "\n",
     "        self.vectorizer = TfidfVectorizer()\n",
-    "        with open(context_path, 'r', encoding='utf-8') as file:\n",
-    "            self.context_sections = file.read().split('\\n\\n') \n",
+    "        with open(context_path, \"r\", encoding=\"utf-8\") as file:\n",
+    "            self.context_sections = file.read().split(\"\\n\\n\")\n",
     "        self.tfidf_matrix = self.vectorizer.fit_transform(self.context_sections)\n",
     "\n",
     "        # Decorate the functions you'd like to trace with @trace()\n",
@@ -100,8 +100,8 @@
     "\n",
     "    @trace()\n",
     "    def retrieve_context(self, query: str) -> str:\n",
-    "        \"\"\"Context retriever. \n",
-    "        \n",
+    "        \"\"\"Context retriever.\n",
+    "\n",
     "        Given the query, returns the most similar context (using TFIDF).\n",
     "        \"\"\"\n",
     "        query_vector = self.vectorizer.transform([query])\n",
@@ -115,7 +115,10 @@
     "        the prompt (formatted to conform with OpenAI models).\"\"\"\n",
     "        return [\n",
     "            {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n",
-    "            {\"role\": \"user\", \"content\": f\"Answer the user query using only the following context: {context}. \\nUser query: {query}\"}\n",
+    "            {\n",
+    "                \"role\": \"user\",\n",
+    "                \"content\": f\"Answer the user query using only the following context: {context}. \\nUser query: {query}\",\n",
+    "            },\n",
     "        ]\n",
     "\n",
     "    @trace()\n",

src/openlayer/lib/core/metrics.py

Lines changed: 1 addition & 1 deletion
@@ -7,7 +7,7 @@
 import json
 import os
 from dataclasses import asdict, dataclass, field
-from typing import Any, Dict, List, Optional, Union, Set
+from typing import Any, Dict, List, Optional, Set, Union
 
 import pandas as pd
 