Skip to content

Commit 2b47469

Browse files
Merge pull request #602 from Labelbox/kkim/AL-2619
[AL-2619] Add 'metadata' field representing data row metadata as a list of DataRowMetadataField
2 parents e6a19d4 + b0572ec commit 2b47469

File tree

7 files changed

+127
-52
lines changed

7 files changed

+127
-52
lines changed

labelbox/orm/db_object.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,15 @@ def _set_field_values(self, field_values):
7070
"field %s", value, field)
7171
elif isinstance(field.field_type, Field.EnumType):
7272
value = field.field_type.enum_cls(value)
73+
elif isinstance(field.field_type, Field.ListType):
74+
if field.field_type.list_cls.__name__ == "DataRowMetadataField":
75+
mdo = self.client.get_data_row_metadata_ontology()
76+
try:
77+
value = mdo.parse_metadata_fields(value)
78+
except ValueError:
79+
logger.warning(
80+
"Failed to convert value '%s' to metadata for field %s",
81+
value, field)
7382
setattr(self, field.name, value)
7483

7584
def __repr__(self):

labelbox/schema/batch.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,8 @@ def export_data_rows(self, timeout_seconds=120) -> Generator:
106106
reader = ndjson.reader(StringIO(response.text))
107107
# TODO: Update result to parse metadataFields when resolver returns
108108
return (Entity.DataRow(self.client, {
109-
**result, 'metadataFields': []
109+
**result, 'metadataFields': [],
110+
'customMetadata': []
110111
}) for result in reader)
111112
elif res["status"] == "FAILED":
112113
raise LabelboxError("Data row export failed.")

labelbox/schema/data_row.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,9 @@ class DataRow(DbObject, Updateable, BulkDeletable):
2121
Otherwise, it's treated as an external URL.
2222
updated_at (datetime)
2323
created_at (datetime)
24-
media_attributes (dict): generated media attributes for the datarow
25-
metadata_fields (list): metadata associated with the datarow
24+
media_attributes (dict): generated media attributes for the data row
25+
metadata_fields (list): metadata associated with the data row
26+
metadata (list): metadata associated with the data row as list of DataRowMetadataField
2627
2728
dataset (Relationship): `ToOne` relationship to Dataset
2829
created_by (Relationship): `ToOne` relationship to User
@@ -36,10 +37,14 @@ class DataRow(DbObject, Updateable, BulkDeletable):
3637
created_at = Field.DateTime("created_at")
3738
media_attributes = Field.Json("media_attributes")
3839
metadata_fields = Field.List(
39-
DataRowMetadataField,
40+
dict,
4041
graphql_type="DataRowCustomMetadataUpsertInput!",
4142
name="metadata_fields",
4243
result_subquery="metadataFields { schemaId name value kind }")
44+
metadata = Field.List(DataRowMetadataField,
45+
name="metadata",
46+
graphql_name="customMetadata",
47+
result_subquery="customMetadata { schemaId value }")
4348

4449
# Relationships
4550
dataset = Relationship.ToOne("Dataset")

labelbox/schema/data_row_metadata.py

Lines changed: 43 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
# type: ignore
22
from datetime import datetime
3-
import warnings
43
from copy import deepcopy
54
from enum import Enum
65
from itertools import chain
@@ -224,34 +223,53 @@ def parse_metadata(
224223

225224
for dr in unparsed:
226225
fields = []
227-
for f in dr["fields"]:
228-
if f["schemaId"] not in self.fields_by_id:
229-
# Update metadata ontology if field can't be found
230-
self.refresh_ontology()
231-
if f["schemaId"] not in self.fields_by_id:
232-
raise ValueError(
233-
f"Schema Id `{f['schemaId']}` not found in ontology"
234-
)
235-
236-
schema = self.fields_by_id[f["schemaId"]]
237-
if schema.kind == DataRowMetadataKind.enum:
238-
continue
239-
elif schema.kind == DataRowMetadataKind.option:
240-
field = DataRowMetadataField(schema_id=schema.parent,
241-
value=schema.uid)
242-
elif schema.kind == DataRowMetadataKind.datetime:
243-
field = DataRowMetadataField(
244-
schema_id=schema.uid,
245-
value=datetime.fromisoformat(f["value"][:-1] +
246-
"+00:00"))
247-
else:
248-
field = DataRowMetadataField(schema_id=schema.uid,
249-
value=f["value"])
250-
fields.append(field)
226+
if "fields" in dr:
227+
fields = self.parse_metadata_fields(dr["fields"])
251228
parsed.append(
252229
DataRowMetadata(data_row_id=dr["dataRowId"], fields=fields))
253230
return parsed
254231

232+
def parse_metadata_fields(
233+
self, unparsed: List[Dict[str,
234+
Dict]]) -> List[DataRowMetadataField]:
235+
""" Parse metadata fields as list of `DataRowMetadataField`
236+
237+
>>> mdo.parse_metadata_fields([metadata_fields])
238+
239+
Args:
240+
unparsed: An unparsed list of metadata represented as a dict containing 'schemaId' and 'value'
241+
242+
Returns:
243+
metadata: List of `DataRowMetadataField`
244+
"""
245+
parsed = []
246+
if isinstance(unparsed, dict):
247+
raise ValueError("Pass a list of dictionaries")
248+
249+
for f in unparsed:
250+
if f["schemaId"] not in self.fields_by_id:
251+
# Update metadata ontology if field can't be found
252+
self.refresh_ontology()
253+
if f["schemaId"] not in self.fields_by_id:
254+
raise ValueError(
255+
f"Schema Id `{f['schemaId']}` not found in ontology")
256+
257+
schema = self.fields_by_id[f["schemaId"]]
258+
if schema.kind == DataRowMetadataKind.enum:
259+
continue
260+
elif schema.kind == DataRowMetadataKind.option:
261+
field = DataRowMetadataField(schema_id=schema.parent,
262+
value=schema.uid)
263+
elif schema.kind == DataRowMetadataKind.datetime:
264+
field = DataRowMetadataField(
265+
schema_id=schema.uid,
266+
value=datetime.fromisoformat(f["value"][:-1] + "+00:00"))
267+
else:
268+
field = DataRowMetadataField(schema_id=schema.uid,
269+
value=f["value"])
270+
parsed.append(field)
271+
return parsed
272+
255273
def bulk_upsert(
256274
self, metadata: List[DataRowMetadata]
257275
) -> List[DataRowMetadataBatchResponse]:

labelbox/schema/dataset.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -442,7 +442,8 @@ def export_data_rows(self, timeout_seconds=120) -> Generator:
442442
reader = ndjson.reader(StringIO(response.text))
443443
# TODO: Update result to parse metadataFields when resolver returns
444444
return (Entity.DataRow(self.client, {
445-
**result, 'metadataFields': []
445+
**result, 'metadataFields': [],
446+
'customMetadata': []
446447
}) for result in reader)
447448
elif res["status"] == "FAILED":
448449
raise LabelboxError("Data row export failed.")

tests/integration/test_data_row_metadata.py

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
import time
21
from datetime import datetime
32

43
import pytest
@@ -282,3 +281,34 @@ def test_parse_raw_metadata(mdo):
282281
for row in parsed:
283282
for field in row.fields:
284283
assert mdo._parse_upsert(field)
284+
285+
286+
def test_parse_raw_metadata_fields(mdo):
287+
example = [
288+
{
289+
'schemaId': 'cko8s9r5v0001h2dk9elqdidh',
290+
'value': 'my-new-message'
291+
},
292+
{
293+
'schemaId': 'cko8sbczn0002h2dkdaxb5kal',
294+
'value': {}
295+
},
296+
{
297+
'schemaId': 'cko8sbscr0003h2dk04w86hof',
298+
'value': {}
299+
},
300+
{
301+
'schemaId': 'cko8sdzv70006h2dk8jg64zvb',
302+
'value': '2021-07-20T21:41:14.606710Z'
303+
},
304+
{
305+
'schemaId': FAKE_SCHEMA_ID,
306+
'value': 0.5
307+
},
308+
]
309+
310+
parsed = mdo.parse_metadata_fields(example)
311+
assert len(parsed) == 4
312+
313+
for field in parsed:
314+
assert mdo._parse_upsert(field)

tests/integration/test_data_rows.py

Lines changed: 32 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,5 @@
1-
import imghdr
21
from tempfile import NamedTemporaryFile
32
import uuid
4-
import time
53
from datetime import datetime
64

75
import pytest
@@ -22,6 +20,14 @@
2220
].sort()
2321

2422

23+
@pytest.fixture
24+
def mdo(client):
25+
mdo = client.get_data_row_metadata_ontology()
26+
mdo._raw_ontology = mdo._get_ontology()
27+
mdo._build_ontology()
28+
yield mdo
29+
30+
2531
def make_metadata_fields():
2632
embeddings = [0.0] * 128
2733
msg = "A message"
@@ -57,12 +63,6 @@ def make_metadata_fields_dict():
5763
return fields
5864

5965

60-
def filter_precomputed_embeddings(metadata_fields):
61-
return list(
62-
filter(lambda md: md["name"] != "precomputedImageEmbedding",
63-
metadata_fields))
64-
65-
6666
def test_get_data_row(datarow, client):
6767
assert client.get_data_row(datarow.uid)
6868

@@ -235,7 +235,7 @@ def test_create_data_row_with_invalid_input(dataset, image_url):
235235
dataset.create_data_row(dr, row_data=image_url)
236236

237237

238-
def test_create_data_row_with_metadata(dataset, image_url):
238+
def test_create_data_row_with_metadata(mdo, dataset, image_url):
239239
client = dataset.client
240240
assert len(list(dataset.data_rows())) == 0
241241

@@ -249,13 +249,17 @@ def test_create_data_row_with_metadata(dataset, image_url):
249249
assert requests.get(image_url).content == \
250250
requests.get(data_row.row_data).content
251251
assert data_row.media_attributes is not None
252-
filtered_md_fields = filter_precomputed_embeddings(data_row.metadata_fields)
253-
assert len(filtered_md_fields) == 4
254-
assert [m["schemaId"] for m in filtered_md_fields
252+
metadata_fields = data_row.metadata_fields
253+
metadata = data_row.metadata
254+
assert len(metadata_fields) == 4
255+
assert len(metadata) == 4
256+
assert [m["schemaId"] for m in metadata_fields
255257
].sort() == EXPECTED_METADATA_SCHEMA_IDS
258+
for m in metadata:
259+
assert mdo._parse_upsert(m)
256260

257261

258-
def test_create_data_row_with_metadata_dict(dataset, image_url):
262+
def test_create_data_row_with_metadata_dict(mdo, dataset, image_url):
259263
client = dataset.client
260264
assert len(list(dataset.data_rows())) == 0
261265

@@ -269,10 +273,14 @@ def test_create_data_row_with_metadata_dict(dataset, image_url):
269273
assert requests.get(image_url).content == \
270274
requests.get(data_row.row_data).content
271275
assert data_row.media_attributes is not None
272-
filtered_md_fields = filter_precomputed_embeddings(data_row.metadata_fields)
273-
assert len(filtered_md_fields) == 4
274-
assert [m["schemaId"] for m in filtered_md_fields
276+
metadata_fields = data_row.metadata_fields
277+
metadata = data_row.metadata
278+
assert len(metadata_fields) == 4
279+
assert len(metadata) == 4
280+
assert [m["schemaId"] for m in metadata_fields
275281
].sort() == EXPECTED_METADATA_SCHEMA_IDS
282+
for m in metadata:
283+
assert mdo._parse_upsert(m)
276284

277285

278286
def test_create_data_row_with_invalid_metadata(dataset, image_url):
@@ -284,7 +292,7 @@ def test_create_data_row_with_invalid_metadata(dataset, image_url):
284292
dataset.create_data_row(row_data=image_url, metadata_fields=fields)
285293

286294

287-
def test_create_data_rows_with_metadata(dataset, image_url):
295+
def test_create_data_rows_with_metadata(mdo, dataset, image_url):
288296
client = dataset.client
289297
assert len(list(dataset.data_rows())) == 0
290298

@@ -322,11 +330,14 @@ def test_create_data_rows_with_metadata(dataset, image_url):
322330
requests.get(row.row_data).content
323331
assert row.media_attributes is not None
324332

325-
# Remove 'precomputedImageEmbedding' metadata if automatically added
326-
filtered_md_fields = filter_precomputed_embeddings(row.metadata_fields)
327-
assert len(filtered_md_fields) == 4
328-
assert [m["schemaId"] for m in filtered_md_fields
333+
metadata_fields = row.metadata_fields
334+
metadata = row.metadata
335+
assert len(metadata_fields) == 4
336+
assert len(metadata) == 4
337+
assert [m["schemaId"] for m in metadata_fields
329338
].sort() == EXPECTED_METADATA_SCHEMA_IDS
339+
for m in metadata:
340+
assert mdo._parse_upsert(m)
330341

331342

332343
def test_create_data_rows_with_invalid_metadata(dataset, image_url):

0 commit comments

Comments (0)