Labelbox
diff --git a/‎examples/README.md‎
Lines changed: 0 additions & 1 deletion b/‎examples/README.md‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎examples/basics/custom_embeddings_sdk.ipynb‎
Lines changed: 159 additions & 50 deletions b/‎examples/basics/custom_embeddings_sdk.ipynb‎
Lines changed: 159 additions & 50 deletions
@@ -14,7 +14,6 @@
 | Data Rows         | [Github](basics/data_rows.ipynb)         | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Labelbox/labelbox-python/blob/develop/examples/basics/data_rows.ipynb)         |
 | Data Row Metadata | [Github](basics/data_row_metadata.ipynb) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Labelbox/labelbox-python/blob/develop/examples/basics/data_row_metadata.ipynb) |
 | Custom Embeddings | [Github](basics/custom_embeddings_sdk.ipynb) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Labelbox/labelbox-python/blob/develop/examples/basics/custom_embeddings_sdk.ipynb) |
-| Datasets          | [Github](basics/datasets.ipynb)          | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Labelbox/labelbox-python/blob/develop/examples/basics/datasets.ipynb)          |
 | Export data       | [Github](exports/export_data.ipynb)       | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Labelbox/labelbox-python/blob/develop/examples/exports/export_data.ipynb)       |
 | Ontologies        | [Github](basics/ontologies.ipynb)        | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Labelbox/labelbox-python/blob/develop/examples/basics/ontologies.ipynb)        |
 | Projects          | [Github](basics/projects.ipynb)          | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Labelbox/labelbox-python/blob/develop/examples/basics/projects.ipynb)          |
 
@@ -39,14 +39,14 @@
     {
       "metadata": {},
       "source": [
-        "# Setup"
+        "# Set up"
       ],
       "cell_type": "markdown"
     },
     {
       "metadata": {},
       "source": [
-        "!pip3 install -q \"labelbox\""
+        "!pip3 install -q \"labelbox[data]\""
       ],
       "cell_type": "code",
       "outputs": [],
@@ -57,12 +57,21 @@
       "source": [
         "import labelbox as lb\n",
         "import numpy as np\n",
-        "import json"
+        "import json\n",
+        "import uuid\n",
+        "import random"
       ],
       "cell_type": "code",
       "outputs": [],
       "execution_count": null
     },
+    {
+      "metadata": {},
+      "source": [
+        "# Replace with your API key"
+      ],
+      "cell_type": "markdown"
+    },
     {
       "metadata": {},
       "source": [
@@ -76,19 +85,31 @@
     {
       "metadata": {},
       "source": [
-        "# Select data rows in Labelbox for custom embeddings"
+        "# Select data rows"
       ],
       "cell_type": "markdown"
     },
     {
       "metadata": {},
       "source": [
-        "client.enable_experimental = True\n",
-        "\n",
-        "# get images from a Labelbox dataset\n",
-        "# Our systems start to process data after 1000 embeddings of each type, for this demo make sure your dataset is over 1000 data rows\n",
-        "dataset = client.get_dataset(\"<DATASET-ID>\")\n",
-        "\n",
+        "- Get images from a Labelbox dataset\n",
+        "- To improve similarity search, you need to upload custom embeddings to at least 1,000 data rows.\n"
+      ],
+      "cell_type": "markdown"
+    },
+    {
+      "metadata": {},
+      "source": [
+        "DATASET_ID = \"\""
+      ],
+      "cell_type": "code",
+      "outputs": [],
+      "execution_count": null
+    },
+    {
+      "metadata": {},
+      "source": [
+        "dataset = client.get_dataset(dataset_id=DATASET_ID)\n",
         "export_task = dataset.export()\n",
         "export_task.wait_till_done()"
       ],
@@ -124,8 +145,8 @@
     {
       "metadata": {},
       "source": [
-        "data_row_ids = [dr[\"data_row\"][\"id\"] for dr in data_rows]\n",
-        "data_row_ids = data_row_ids[:1000] # keep the first 1000 examples for the sake of this demo"
+        "data_row_dict = [{\"data_row_id\": dr[\"data_row\"][\"id\"], \"row_data\": dr[\"data_row\"][\"row_data\"]} for dr in data_rows]\n",
+        "data_row_dict = data_row_dict[:1000] # keep the first 1000 examples for the sake of this demo"
       ],
       "cell_type": "code",
       "outputs": [],
@@ -134,19 +155,22 @@
     {
       "metadata": {},
       "source": [
-        "# Create the payload for custom embeddings\n",
-        "-- It should be a .ndjson file.   \n",
-        "-- Every line is a json file that finishes with a \\n character.  \n",
-        "-- It does not have to be created through Python.  "
+        "# Create custom embedding payload"
       ],
       "cell_type": "markdown"
     },
     {
       "metadata": {},
       "source": [
-        "nb_data_rows = len(data_row_ids)\n",
+        "Generate random vectors for embeddings (max: 2048 dimensions)"
+      ],
+      "cell_type": "markdown"
+    },
+    {
+      "metadata": {},
+      "source": [
+        "nb_data_rows = len(data_row_dict)\n",
         "print(\"Number of data rows: \", nb_data_rows)\n",
-        "# Generate random vectors, of dimension 2048 each\n",
         "# Labelbox supports custom embedding vectors of dimension up to 2048\n",
         "custom_embeddings = [list(np.random.random(2048)) for _ in range(nb_data_rows)]"
       ],
@@ -157,12 +181,14 @@
     {
       "metadata": {},
       "source": [
-        "# Create the payload for custom embeddings\n",
-        "payload = []\n",
-        "for data_row_id,custom_embedding in zip(data_row_ids,custom_embeddings):\n",
-        "  payload.append({\"id\": data_row_id, \"vector\": custom_embedding})\n",
-        "\n",
-        "print('payload', len(payload),payload[:1])"
+        "List all custom embeddings available in your Labelbox workspace"
+      ],
+      "cell_type": "markdown"
+    },
+    {
+      "metadata": {},
+      "source": [
+        "embeddings = client.get_embeddings()"
       ],
       "cell_type": "code",
       "outputs": [],
@@ -171,16 +197,15 @@
     {
       "metadata": {},
       "source": [
-        "# Delete any pre-existing file\n",
-        "import os\n",
-        "if os.path.exists(\"payload.ndjson\"):\n",
-        "  os.remove(\"payload.ndjson\")\n",
-        "\n",
-        "# Convert the payload to a JSON file\n",
-        "with open('payload.ndjson', 'w') as f:\n",
-        "  for p in payload:\n",
-        "    f.write(json.dumps(p) + \"\\n\")\n",
-        "    # sanity_check_payload = json.dump(payload, f)"
+        "Choose an existing embedding type or create a new one"
+      ],
+      "cell_type": "markdown"
+    },
+    {
+      "metadata": {},
+      "source": [
+        "# Name of the custom embedding must be unique\n",
+        "embedding = client.create_embedding(\"my_custom_embedding_2048_dimensions\", 2048)"
       ],
       "cell_type": "code",
       "outputs": [],
@@ -189,10 +214,26 @@
     {
       "metadata": {},
       "source": [
-        "# Sanity check that you can read/load the file and the payload is correct\n",
-        "with open('payload.ndjson') as f:\n",
-        "    sanity_check_payload = [json.loads(l) for l in f.readlines()]\n",
-        "print(\"Nb of custom embedding vectors in sanity_check_payload: \", len(sanity_check_payload))"
+        "Create payload"
+      ],
+      "cell_type": "markdown"
+    },
+    {
+      "metadata": {},
+      "source": [
+        "- The payload should encompass all the data you wish to retain, along with the new embeddings vector data.\n",
+        "- `row_data` and `key` are required when using `dataset.upsert_data_rows()`."
+      ],
+      "cell_type": "markdown"
+    },
+    {
+      "metadata": {},
+      "source": [
+        "# Build one upsert entry per data row: its key, its existing row_data, and the new embedding vector\n",
+        "payload = []\n",
+        "# The loop variable is deliberately NOT named `data_row_dict`: reusing that name would overwrite\n",
+        "# the list being iterated and make this cell fail when it is run a second time\n",
+        "for data_row, custom_embedding in zip(data_row_dict, custom_embeddings):\n",
+        "  payload.append({\n",
+        "    \"key\": lb.UniqueId(data_row[\"data_row_id\"]),\n",
+        "    \"row_data\": data_row[\"row_data\"],\n",
+        "    \"embeddings\": [{\"embedding_id\": embedding.id, \"vector\": custom_embedding}],\n",
+        "  })\n",
+        "\n",
+        "print('payload', len(payload),payload[:1])"
       ],
       "cell_type": "code",
       "outputs": [],
@@ -201,8 +242,24 @@
     {
       "metadata": {},
       "source": [
-        "# See all custom embeddings available in your Labelbox workspace\n",
-        "embeddings = client.get_embeddings()"
+        "# Upload payload"
+      ],
+      "cell_type": "markdown"
+    },
+    {
+      "metadata": {},
+      "source": [
+        "Upsert data rows with custom embeddings"
+      ],
+      "cell_type": "markdown"
+    },
+    {
+      "metadata": {},
+      "source": [
+        "task = dataset.upsert_data_rows(payload)\n",
+        "task.wait_till_done()\n",
+        "print(task.errors)\n",
+        "print(task.status)"
       ],
       "cell_type": "code",
       "outputs": [],
@@ -211,9 +268,16 @@
     {
       "metadata": {},
       "source": [
-        "# Create a new custom embedding, unless you want to re-use one\n",
-        "# Name of the custom embedding must be unique\n",
-        "embedding = client.create_embedding(\"my_custom_embedding_2048_dimensions\", 2048)"
+        "Get the count of imported vectors for a custom embedding"
+      ],
+      "cell_type": "markdown"
+    },
+    {
+      "metadata": {},
+      "source": [
+        "# Count how many data rows have a specific custom embedding (this can take a couple of minutes)\n",
+        "count = embedding.get_imported_vector_count()\n",
+        "print(count)"
       ],
       "cell_type": "code",
       "outputs": [],
@@ -222,7 +286,13 @@
     {
       "metadata": {},
       "source": [
-        "# Delete a custom embedding\n",
+        "Delete custom embedding type"
+      ],
+      "cell_type": "markdown"
+    },
+    {
+      "metadata": {},
+      "source": [
         "#embedding.delete()"
       ],
       "cell_type": "code",
@@ -232,15 +302,22 @@
     {
       "metadata": {},
       "source": [
-        "# Upload the payload to Labelbox"
+        "# Upload custom embeddings during data row creation"
+      ],
+      "cell_type": "markdown"
+    },
+    {
+      "metadata": {},
+      "source": [
+        "Create a dataset"
       ],
       "cell_type": "markdown"
     },
     {
       "metadata": {},
       "source": [
-        "# Replace the current id with the newly generated id from the previous step, or any existing custom embedding id\n",
-        "embedding.import_vectors_from_file(\"./payload.ndjson\")"
+        "# Create a dataset\n",
+        "dataset_new = client.create_dataset(name=\"data_rows_with_embeddings\")"
       ],
       "cell_type": "code",
       "outputs": [],
@@ -249,15 +326,47 @@
     {
       "metadata": {},
       "source": [
-        "# Get the count of imported vectors for a custom embedding"
+        "Fetch an embedding (2048 dimensions)"
       ],
       "cell_type": "markdown"
     },
     {
       "metadata": {},
       "source": [
-        "# Count how many data rows have a specific custom embedding (this can take a couple of minutes)\n",
-        "count = embedding.get_imported_vector_count()"
+        "embedding = client.get_embedding_by_name(\"my_custom_embedding_2048_dimensions\")\n",
+        "vector = [random.uniform(1.0, 2.0) for _ in range(embedding.dims)]"
+      ],
+      "cell_type": "code",
+      "outputs": [],
+      "execution_count": null
+    },
+    {
+      "metadata": {},
+      "source": [
+        "Upload data rows with embeddings"
+      ],
+      "cell_type": "markdown"
+    },
+    {
+      "metadata": {},
+      "source": [
+        "# Build eight data rows; every row reuses the same placeholder `vector` for the fetched embedding\n",
+        "uploads = []\n",
+        "for i in range(1, 9):\n",
+        "    uploads.append({\n",
+        "        \"row_data\": f\"https://storage.googleapis.com/labelbox-datasets/People_Clothing_Segmentation/jpeg_images/IMAGES/img_000{i}.jpeg\",\n",
+        "        # Embed the canonical UUID text; the previous \"%id\" format converted the UUID to an\n",
+        "        # integer and left a stray literal 'd' at the end of every key\n",
+        "        \"global_key\": f\"TEST-ID-{uuid.uuid1()}\",\n",
+        "        \"embeddings\": [{\n",
+        "            \"embedding_id\": embedding.id,\n",
+        "            \"vector\": vector\n",
+        "        }]\n",
+        "    })\n",
+        "\n",
+        "task1 = dataset_new.create_data_rows(uploads)\n",
+        "task1.wait_till_done()\n",
+        "print(\"ERRORS: \" , task1.errors)\n",
+        "print(\"RESULTS:\" , task1.result)"
       ],
       "cell_type": "code",
       "outputs": [],