
Commit 5e9b57c

[SN-122] Update import notebook for image data (#1501)
2 parents bcf2fb1 + ea5ecca

File tree

1 file changed: 315 additions, 0 deletions
@@ -0,0 +1,315 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {},
"cells": [
{
"metadata": {},
"source": [
"!pip install -q \"labelbox[data]\""
],
"cell_type": "code",
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"source": [
"import labelbox as lb\n",
"from labelbox.schema.data_row_metadata import DataRowMetadataField, DataRowMetadataKind\n",
"import datetime\n",
"import random\n",
"import os\n",
"import json\n",
"from PIL import Image\n",
"from labelbox.schema.ontology import OntologyBuilder, Tool\n",
"import requests\n",
"from tqdm.notebook import tqdm\n",
"import uuid\n",
"from labelbox.data.annotation_types import Label, ImageData, ObjectAnnotation, Rectangle, Point"
],
"cell_type": "code",
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"source": [
"# Set up the Labelbox client"
],
"cell_type": "markdown"
},
{
"metadata": {},
"source": [
"# Initialize the Labelbox client\n",
"API_KEY = \"\"  # Paste your API key here\n",
"client = lb.Client(API_KEY)"
],
"cell_type": "code",
"outputs": [],
"execution_count": null
},
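{
"metadata": {},
"source": [
"Hardcoding the key is fine for a quick test, but a safer pattern is to read it from the environment so it never lands in the notebook. This is a minimal sketch assuming you exported the key under the hypothetical variable name `LABELBOX_API_KEY`."
],
"cell_type": "markdown"
},
{
"metadata": {},
"source": [
"# Optional: read the API key from an environment variable instead of hardcoding it\n",
"# (assumes the hypothetical variable LABELBOX_API_KEY was exported beforehand)\n",
"API_KEY = os.environ.get(\"LABELBOX_API_KEY\", API_KEY)\n",
"client = lb.Client(API_KEY)"
],
"cell_type": "code",
"outputs": [],
"execution_count": null
},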
{
"metadata": {},
"source": [
"# Download a public dataset\n"
],
"cell_type": "markdown"
},
{
"metadata": {},
"source": [
"# Function to download files\n",
"def download_files(filemap):\n",
"    path, uri = filemap\n",
"    if not os.path.exists(path):\n",
"        response = requests.get(uri, stream=True)\n",
"        with open(path, 'wb') as f:\n",
"            for chunk in response.iter_content(chunk_size=8192):\n",
"                f.write(chunk)\n",
"    return path"
],
"cell_type": "code",
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"source": [
"# Download data rows and annotations\n",
"DATA_ROWS_URL = \"https://storage.googleapis.com/labelbox-datasets/VHR_geospatial/geospatial_datarows.json\"\n",
"ANNOTATIONS_URL = \"https://storage.googleapis.com/labelbox-datasets/VHR_geospatial/geospatial_annotations.json\"\n",
"download_files((\"data_rows.json\", DATA_ROWS_URL))\n",
"download_files((\"annotations.json\", ANNOTATIONS_URL))"
],
"cell_type": "code",
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"source": [
"# Load data rows and annotations\n",
"with open('data_rows.json') as fp:\n",
"    data_rows = json.load(fp)\n",
"with open('annotations.json') as fp:\n",
"    annotations = json.load(fp)"
],
"cell_type": "code",
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"source": [
"# Create a dataset"
],
"cell_type": "markdown"
},
{
"metadata": {},
"source": [
"# Create a new dataset\n",
"dataset = client.create_dataset(name=\"Geospatial vessel detection\")\n",
"print(f\"Created dataset with ID: {dataset.uid}\")"
],
"cell_type": "code",
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"source": [
"# Import Data Rows with Metadata"
],
"cell_type": "markdown"
},
{
"metadata": {},
"source": [
"# Example of adding two metadata fields to your Data Rows: a \"captureDateTime\" field with a datetime value and a \"tag\" field with a string value\n",
"metadata_ontology = client.get_data_row_metadata_ontology()\n",
"datetime_schema_id = metadata_ontology.reserved_by_name[\"captureDateTime\"].uid\n",
"tag_schema_id = metadata_ontology.reserved_by_name[\"tag\"].uid\n",
"tag_items = [\"WorldView-1\", \"WorldView-2\", \"WorldView-3\", \"WorldView-4\"]\n",
"\n",
"for datarow in tqdm(data_rows):\n",
"    dt = datetime.datetime.utcnow() + datetime.timedelta(days=random.random() * 30)  # a random datetime within the next 30 days\n",
"    tag_item = random.choice(tag_items)  # a random tag value\n",
"\n",
"    # Option 1: specify metadata with a list of DataRowMetadataField objects. This is the recommended option, since it validates the metadata fields.\n",
"    metadata_fields = [\n",
"        DataRowMetadataField(schema_id=datetime_schema_id, value=dt),\n",
"        DataRowMetadataField(schema_id=tag_schema_id, value=tag_item)\n",
"    ]\n",
"\n",
"    # Option 2 (uncomment to try): specify the metadata fields as plain dictionaries instead of DataRowMetadataField objects. This is equivalent to Option 1.\n",
"    # metadata_fields = [\n",
"    #     {\"schema_id\": datetime_schema_id, \"value\": dt},\n",
"    #     {\"schema_id\": tag_schema_id, \"value\": tag_item}\n",
"    # ]\n",
"\n",
"    datarow[\"metadata_fields\"] = metadata_fields"
],
"cell_type": "code",
"outputs": [],
"execution_count": null
},
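{
"metadata": {},
"source": [
"If the two reserved field lookups above ever raise a `KeyError`, it can help to list what the workspace actually exposes. This is an optional sketch, assuming `reserved_by_name` behaves like a standard Python mapping (it is indexed by name above, so `.keys()` should be available)."
],
"cell_type": "markdown"
},
{
"metadata": {},
"source": [
"# Optional: list the reserved metadata field names available in this workspace\n",
"# (assumes reserved_by_name supports the standard mapping interface)\n",
"print(sorted(metadata_ontology.reserved_by_name.keys()))"
],
"cell_type": "code",
"outputs": [],
"execution_count": null
},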
{
"metadata": {},
"source": [
"task = dataset.create_data_rows(data_rows)\n",
"task.wait_till_done()\n",
"print(f\"Failed data rows: {task.failed_data_rows}\")\n",
"print(f\"Errors: {task.errors}\")\n",
"\n",
"if task.errors:\n",
"    for error in task.errors:\n",
"        if 'Duplicate global key' in error['message'] and dataset.row_count == 0:\n",
"            # If the global key already exists in the workspace, the dataset is created empty, so we can delete it\n",
"            print(f\"Deleting empty dataset: {dataset}\")\n",
"            dataset.delete()"
],
"cell_type": "code",
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"source": [
"# Examine a Data Row"
],
"cell_type": "markdown"
},
{
"metadata": {},
"source": [
"datarow = next(dataset.data_rows())\n",
"print(datarow)"
],
"cell_type": "code",
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"source": [
"# Set up a labeling project"
],
"cell_type": "markdown"
},
{
"metadata": {},
"source": [
"# Initialize the OntologyBuilder\n",
"ontology_builder = OntologyBuilder()\n",
"\n",
"# Add a bounding-box tool for each category in the downloaded annotations\n",
"for category in annotations['categories']:\n",
"    print(category['name'])\n",
"    ontology_builder.add_tool(Tool(tool=Tool.Type.BBOX, name=category['name']))\n",
"\n",
"# Create the ontology in Labelbox\n",
"ontology = client.create_ontology(\"Vessel Detection Ontology\",\n",
"                                  ontology_builder.asdict(),\n",
"                                  media_type=lb.MediaType.Image)\n",
"print(f\"Created ontology with ID: {ontology.uid}\")\n",
"\n",
"# Create a project and set up the ontology\n",
"project = client.create_project(name=\"Vessel Detection\", media_type=lb.MediaType.Image)\n",
"project.setup_editor(ontology=ontology)\n",
"print(f\"Created project with ID: {project.uid}\")"
],
"cell_type": "code",
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"source": [
"# Export data rows from the dataset\n",
"data_rows = [dr.uid for dr in dataset.export_data_rows()]\n",
"\n",
"# Randomly select 200 Data Rows (or fewer if the dataset has fewer than 200)\n",
"sampled_data_rows = random.sample(data_rows, min(len(data_rows), 200))\n",
"\n",
"# Create a new batch in the project and add the sampled data rows\n",
"batch = project.create_batch(\n",
"    \"Initial batch\",    # name of the batch\n",
"    sampled_data_rows,  # list of Data Rows\n",
"    1                   # priority between 1-5\n",
")\n",
"print(f\"Created batch with ID: {batch.uid}\")"
],
"cell_type": "code",
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"source": [
"queued_data_rows = project.export_queued_data_rows()\n",
"labels = []\n",
"\n",
"# Build the ontology dictionary once, outside the loop\n",
"ontology_dict = ontology_builder.asdict()\n",
"\n",
"for datarow in queued_data_rows:\n",
"    annotations_list = []\n",
"    folder = datarow['externalId'].split(\"/\")[0]\n",
"    file_name = datarow['externalId'].split(\"/\")[1]\n",
"    if folder == \"positive_image_set\":\n",
"        for image in annotations['images']:\n",
"            if image['file_name'] == file_name:\n",
"                for annotation in annotations['annotations']:\n",
"                    if annotation['image_id'] == image['id']:\n",
"                        bbox = annotation['bbox']  # COCO format: [x, y, width, height]\n",
"                        category_id = annotation['category_id'] - 1\n",
"                        class_name = None\n",
"                        for category in ontology_dict['tools']:\n",
"                            if category['name'] == annotations['categories'][category_id]['name']:\n",
"                                class_name = category['name']\n",
"                                break\n",
"                        if class_name:\n",
"                            annotations_list.append(ObjectAnnotation(\n",
"                                name=class_name,\n",
"                                value=Rectangle(start=Point(x=bbox[0], y=bbox[1]), end=Point(x=bbox[0] + bbox[2], y=bbox[1] + bbox[3]))\n",
"                            ))\n",
"    image_data = ImageData(uid=datarow['id'])\n",
"    labels.append(Label(data=image_data, annotations=annotations_list))"
],
"cell_type": "code",
"outputs": [],
"execution_count": null
},
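{
"metadata": {},
"source": [
"The cell above converts COCO-style boxes into Labelbox rectangles. As a worked example with made-up numbers (illustrative only, not values from this dataset): a COCO bbox `[10, 20, 30, 40]` means top-left `(10, 20)`, width 30, height 40, so the bottom-right corner is `(10 + 30, 20 + 40) = (40, 60)`."
],
"cell_type": "markdown"
},
{
"metadata": {},
"source": [
"# Illustrative only: convert a hypothetical COCO bbox [x, y, width, height] to corner points\n",
"demo_bbox = [10, 20, 30, 40]\n",
"demo_rect = Rectangle(start=Point(x=demo_bbox[0], y=demo_bbox[1]),\n",
"                      end=Point(x=demo_bbox[0] + demo_bbox[2], y=demo_bbox[1] + demo_bbox[3]))\n",
"print(demo_rect.start, demo_rect.end)  # expected corners: (10, 20) and (40, 60)"
],
"cell_type": "code",
"outputs": [],
"execution_count": null
},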
{
"metadata": {},
"source": [
"print(labels)"
],
"cell_type": "code",
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"source": [
"upload_job = lb.LabelImport.create_from_objects(\n",
"    client=client,\n",
"    project_id=project.uid,\n",
"    name=f\"label_import_job_{str(uuid.uuid4())}\",\n",
"    labels=labels\n",
")\n",
"\n",
"# Wait for the upload to finish and print the results\n",
"upload_job.wait_until_done()\n",
"\n",
"print(f\"Errors: {upload_job.errors}\")\n",
"print(f\"Status of uploads: {upload_job.statuses}\")"
],
"cell_type": "code",
"outputs": [],
"execution_count": null
},
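{
"metadata": {},
"source": [
"Optional cleanup, as a hedged sketch: if the dataset and project were created purely for this demo, you can delete them afterwards. `dataset.delete()` appears earlier in this notebook; `project.delete()` is assumed to be available on the project object. Both lines are left commented out so they are not run by accident."
],
"cell_type": "markdown"
},
{
"metadata": {},
"source": [
"# Optional cleanup: uncomment to delete the demo project and dataset\n",
"# (project.delete() is an assumption; dataset.delete() is used earlier in this notebook)\n",
"# project.delete()\n",
"# dataset.delete()"
],
"cell_type": "code",
"outputs": [],
"execution_count": null
}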
]
}
