
Commit 5e9b57c

[SN-122] Update import notebook for image data (#1501)
2 parents bcf2fb1 + ea5ecca

File tree

1 file changed: 315 additions, 0 deletions
@@ -0,0 +1,315 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {},
"cells": [
{
"metadata": {},
"source": [
"!pip install -q \"labelbox[data]\""
],
"cell_type": "code",
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"source": [
"import labelbox as lb\n",
"from labelbox.schema.data_row_metadata import DataRowMetadataField, DataRowMetadataKind\n",
"import datetime\n",
"import random\n",
"import os\n",
"import json\n",
"from PIL import Image\n",
"from labelbox.schema.ontology import OntologyBuilder, Tool\n",
"import requests\n",
"from tqdm.notebook import tqdm\n",
"import uuid\n",
"from labelbox.data.annotation_types import Label, ImageData, ObjectAnnotation, Rectangle, Point"
],
"cell_type": "code",
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"source": [
"# Set up the Labelbox client"
],
"cell_type": "markdown"
},
{
"metadata": {},
"source": [
"# Initialize the Labelbox client\n",
"API_KEY = \"\"  # Paste your API key here\n",
"client = lb.Client(API_KEY)"
],
"cell_type": "code",
"outputs": [],
"execution_count": null
},
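{
"metadata": {},
"source": [
"Hardcoding the key is fine for a quick test, but a safer pattern is to read it from the environment so it never lands in the notebook. This is a minimal sketch assuming you exported the key under the hypothetical variable name `LABELBOX_API_KEY`."
],
"cell_type": "markdown"
},
{
"metadata": {},
"source": [
"# Optional: read the API key from an environment variable instead of hardcoding it\n",
"# (assumes the hypothetical variable LABELBOX_API_KEY was exported beforehand)\n",
"API_KEY = os.environ.get(\"LABELBOX_API_KEY\", API_KEY)\n",
"client = lb.Client(API_KEY)"
],
"cell_type": "code",
"outputs": [],
"execution_count": null
},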
{
"metadata": {},
"source": [
"# Download a public dataset\n"
],
"cell_type": "markdown"
},
{
"metadata": {},
"source": [
"# Function to download files\n",
"def download_files(filemap):\n",
"    path, uri = filemap\n",
"    if not os.path.exists(path):\n",
"        response = requests.get(uri, stream=True)\n",
"        with open(path, 'wb') as f:\n",
"            for chunk in response.iter_content(chunk_size=8192):\n",
"                f.write(chunk)\n",
"    return path"
],
"cell_type": "code",
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"source": [
"# Download data rows and annotations\n",
"DATA_ROWS_URL = \"https://storage.googleapis.com/labelbox-datasets/VHR_geospatial/geospatial_datarows.json\"\n",
"ANNOTATIONS_URL = \"https://storage.googleapis.com/labelbox-datasets/VHR_geospatial/geospatial_annotations.json\"\n",
"download_files((\"data_rows.json\", DATA_ROWS_URL))\n",
"download_files((\"annotations.json\", ANNOTATIONS_URL))"
],
"cell_type": "code",
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"source": [
"# Load data rows and annotations\n",
"with open('data_rows.json') as fp:\n",
"    data_rows = json.load(fp)\n",
"with open('annotations.json') as fp:\n",
"    annotations = json.load(fp)"
],
"cell_type": "code",
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"source": [
"# Create a dataset"
],
"cell_type": "markdown"
},
{
"metadata": {},
"source": [
"# Create a new dataset\n",
"dataset = client.create_dataset(name=\"Geospatial vessel detection\")\n",
"print(f\"Created dataset with ID: {dataset.uid}\")"
],
"cell_type": "code",
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"source": [
"# Import Data Rows with Metadata"
],
"cell_type": "markdown"
},
{
"metadata": {},
"source": [
"# Example of adding two metadata fields to your Data Rows: a \"captureDateTime\" field with a datetime value and a \"tag\" field with a string value\n",
"metadata_ontology = client.get_data_row_metadata_ontology()\n",
"datetime_schema_id = metadata_ontology.reserved_by_name[\"captureDateTime\"].uid\n",
"tag_schema_id = metadata_ontology.reserved_by_name[\"tag\"].uid\n",
"tag_items = [\"WorldView-1\", \"WorldView-2\", \"WorldView-3\", \"WorldView-4\"]\n",
"\n",
"for datarow in tqdm(data_rows):\n",
"    dt = datetime.datetime.utcnow() + datetime.timedelta(days=random.random() * 30)  # a random datetime within the next 30 days\n",
"    tag_item = random.choice(tag_items)  # a random tag value\n",
"\n",
"    # Option 1: specify metadata with a list of DataRowMetadataField objects. This is the recommended option, since it validates the metadata fields.\n",
"    metadata_fields = [\n",
"        DataRowMetadataField(schema_id=datetime_schema_id, value=dt),\n",
"        DataRowMetadataField(schema_id=tag_schema_id, value=tag_item)\n",
"    ]\n",
"\n",
"    # Option 2 (uncomment to try): specify the metadata fields as plain dictionaries instead of DataRowMetadataField objects. This is equivalent to Option 1.\n",
"    # metadata_fields = [\n",
"    #     {\"schema_id\": datetime_schema_id, \"value\": dt},\n",
"    #     {\"schema_id\": tag_schema_id, \"value\": tag_item}\n",
"    # ]\n",
"\n",
"    datarow[\"metadata_fields\"] = metadata_fields"
],
"cell_type": "code",
"outputs": [],
"execution_count": null
},
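{
"metadata": {},
"source": [
"If the two reserved field lookups above ever raise a `KeyError`, it can help to list what the workspace actually exposes. This is an optional sketch, assuming `reserved_by_name` behaves like a standard Python mapping (it is indexed by name above, so `.keys()` should be available)."
],
"cell_type": "markdown"
},
{
"metadata": {},
"source": [
"# Optional: list the reserved metadata field names available in this workspace\n",
"# (assumes reserved_by_name supports the standard mapping interface)\n",
"print(sorted(metadata_ontology.reserved_by_name.keys()))"
],
"cell_type": "code",
"outputs": [],
"execution_count": null
},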
{
"metadata": {},
"source": [
"task = dataset.create_data_rows(data_rows)\n",
"task.wait_till_done()\n",
"print(f\"Failed data rows: {task.failed_data_rows}\")\n",
"print(f\"Errors: {task.errors}\")\n",
"\n",
"if task.errors:\n",
"    for error in task.errors:\n",
"        if 'Duplicate global key' in error['message'] and dataset.row_count == 0:\n",
"            # If the global key already exists in the workspace, the dataset is created empty, so we can delete it\n",
"            print(f\"Deleting empty dataset: {dataset}\")\n",
"            dataset.delete()"
],
"cell_type": "code",
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"source": [
"# Examine a Data Row"
],
"cell_type": "markdown"
},
{
"metadata": {},
"source": [
"datarow = next(dataset.data_rows())\n",
"print(datarow)"
],
"cell_type": "code",
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"source": [
"# Set up a labeling project"
],
"cell_type": "markdown"
},
{
"metadata": {},
"source": [
"# Initialize the OntologyBuilder\n",
"ontology_builder = OntologyBuilder()\n",
"\n",
"# Add a bounding-box tool for each category in the downloaded annotations\n",
"for category in annotations['categories']:\n",
"    print(category['name'])\n",
"    ontology_builder.add_tool(Tool(tool=Tool.Type.BBOX, name=category['name']))\n",
"\n",
"# Create the ontology in Labelbox\n",
"ontology = client.create_ontology(\"Vessel Detection Ontology\",\n",
"                                  ontology_builder.asdict(),\n",
"                                  media_type=lb.MediaType.Image)\n",
"print(f\"Created ontology with ID: {ontology.uid}\")\n",
"\n",
"# Create a project and set up the ontology\n",
"project = client.create_project(name=\"Vessel Detection\", media_type=lb.MediaType.Image)\n",
"project.setup_editor(ontology=ontology)\n",
"print(f\"Created project with ID: {project.uid}\")"
],
"cell_type": "code",
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"source": [
"# Export data rows from the dataset\n",
"data_rows = [dr.uid for dr in dataset.export_data_rows()]\n",
"\n",
"# Randomly select 200 Data Rows (or fewer if the dataset has fewer than 200)\n",
"sampled_data_rows = random.sample(data_rows, min(len(data_rows), 200))\n",
"\n",
"# Create a new batch in the project and add the sampled data rows\n",
"batch = project.create_batch(\n",
"    \"Initial batch\",    # name of the batch\n",
"    sampled_data_rows,  # list of Data Rows\n",
"    1                   # priority between 1-5\n",
")\n",
"print(f\"Created batch with ID: {batch.uid}\")"
],
"cell_type": "code",
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"source": [
"queued_data_rows = project.export_queued_data_rows()\n",
"labels = []\n",
"\n",
"# Build the ontology dictionary once, outside the loop\n",
"ontology_dict = ontology_builder.asdict()\n",
"\n",
"for datarow in queued_data_rows:\n",
"    annotations_list = []\n",
"    folder = datarow['externalId'].split(\"/\")[0]\n",
"    file_name = datarow['externalId'].split(\"/\")[1]\n",
"    if folder == \"positive_image_set\":\n",
"        for image in annotations['images']:\n",
"            if image['file_name'] == file_name:\n",
"                for annotation in annotations['annotations']:\n",
"                    if annotation['image_id'] == image['id']:\n",
"                        bbox = annotation['bbox']  # COCO format: [x, y, width, height]\n",
"                        category_id = annotation['category_id'] - 1\n",
"                        class_name = None\n",
"                        for category in ontology_dict['tools']:\n",
"                            if category['name'] == annotations['categories'][category_id]['name']:\n",
"                                class_name = category['name']\n",
"                                break\n",
"                        if class_name:\n",
"                            annotations_list.append(ObjectAnnotation(\n",
"                                name=class_name,\n",
"                                value=Rectangle(start=Point(x=bbox[0], y=bbox[1]), end=Point(x=bbox[0] + bbox[2], y=bbox[1] + bbox[3]))\n",
"                            ))\n",
"    image_data = ImageData(uid=datarow['id'])\n",
"    labels.append(Label(data=image_data, annotations=annotations_list))"
],
"cell_type": "code",
"outputs": [],
"execution_count": null
},
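{
"metadata": {},
"source": [
"The cell above converts COCO-style boxes into Labelbox rectangles. As a worked example with made-up numbers (illustrative only, not values from this dataset): a COCO bbox `[10, 20, 30, 40]` means top-left `(10, 20)`, width 30, height 40, so the bottom-right corner is `(10 + 30, 20 + 40) = (40, 60)`."
],
"cell_type": "markdown"
},
{
"metadata": {},
"source": [
"# Illustrative only: convert a hypothetical COCO bbox [x, y, width, height] to corner points\n",
"demo_bbox = [10, 20, 30, 40]\n",
"demo_rect = Rectangle(start=Point(x=demo_bbox[0], y=demo_bbox[1]),\n",
"                      end=Point(x=demo_bbox[0] + demo_bbox[2], y=demo_bbox[1] + demo_bbox[3]))\n",
"print(demo_rect.start, demo_rect.end)  # expected corners: (10, 20) and (40, 60)"
],
"cell_type": "code",
"outputs": [],
"execution_count": null
},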
{
"metadata": {},
"source": [
"print(labels)"
],
"cell_type": "code",
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"source": [
"upload_job = lb.LabelImport.create_from_objects(\n",
"    client=client,\n",
"    project_id=project.uid,\n",
"    name=f\"label_import_job_{str(uuid.uuid4())}\",\n",
"    labels=labels\n",
")\n",
"\n",
"# Wait for the upload to finish and print the results\n",
"upload_job.wait_until_done()\n",
"\n",
"print(f\"Errors: {upload_job.errors}\")\n",
"print(f\"Status of uploads: {upload_job.statuses}\")"
],
"cell_type": "code",
"outputs": [],
"execution_count": null
},
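{
"metadata": {},
"source": [
"Optional cleanup, as a hedged sketch: if the dataset and project were created purely for this demo, you can delete them afterwards. `dataset.delete()` appears earlier in this notebook; `project.delete()` is assumed to be available on the project object. Both lines are left commented out so they are not run by accident."
],
"cell_type": "markdown"
},
{
"metadata": {},
"source": [
"# Optional cleanup: uncomment to delete the demo project and dataset\n",
"# (project.delete() is an assumption; dataset.delete() is used earlier in this notebook)\n",
"# project.delete()\n",
"# dataset.delete()"
],
"cell_type": "code",
"outputs": [],
"execution_count": null
}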
]
}
