
Commit b20f5bf

Author: Matt Sokoloff
Message: update tests, prep for release
Parent: c9c66b7

File tree

6 files changed: +160 −47 lines


CHANGELOG.md

Lines changed: 9 additions & 0 deletions
@@ -1,5 +1,14 @@
 # Changelog
 
+# Version 3.3.0 (2021-09-02)
+## Added
+* `Dataset.create_data_rows_sync()` for synchronous bulk uploads of data rows
+* `Model.delete()`, `ModelRun.delete()`, and `ModelRun.delete_annotation_groups()` to
+  clean up models, model runs, and annotation groups.
+
+## Fix
+* Increased timeout for label exports since projects with many segmentation masks weren't finishing quickly enough.
+
 # Version 3.2.1 (2021-08-31)
 ## Fix
 * Resolved issue with `create_data_rows()` was not working on amazon linux
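A minimal usage sketch of the 3.3.0 additions above, assuming an existing API key, dataset, and model; the `API_KEY`, `DATASET_ID`, and `MODEL_ID` values and the `get_model()` / `model_runs()` lookups are illustrative placeholders, not part of this commit.

    from labelbox import Client

    client = Client(api_key="API_KEY")          # placeholder API key
    dataset = client.get_dataset("DATASET_ID")  # assumes an existing dataset

    # New in 3.3.0: synchronous bulk upload (at most 1000 rows, 5 attachments per row)
    dataset.create_data_rows_sync([
        {"row_data": "https://example.com/image.png", "external_id": "img-1"},
    ])

    # New in 3.3.0: cleanup helpers. The lookups below are assumptions; only the
    # delete methods themselves are introduced by this release.
    model = client.get_model("MODEL_ID")
    for model_run in model.model_runs():
        model_run.delete_annotation_groups()
        model_run.delete()
    model.delete()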

labelbox/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 name = "labelbox"
-__version__ = "3.2.1"
+__version__ = "3.3.0"
 
 from labelbox.schema.project import Project
 from labelbox.client import Client

labelbox/schema/dataset.py

Lines changed: 113 additions & 39 deletions
@@ -14,6 +14,7 @@
 from labelbox.orm.model import Entity, Field, Relationship
 
 logger = logging.getLogger(__name__)
+import time
 
 
 class Dataset(DbObject, Updateable, Deletable):
@@ -69,13 +70,111 @@ def create_data_row(self, **kwargs):
             row_data = kwargs[DataRow.row_data.name]
         if os.path.exists(row_data):
             kwargs[DataRow.row_data.name] = self.client.upload_file(row_data)
-
         kwargs[DataRow.dataset.name] = self
-
         return self.client._create(DataRow, kwargs)
 
+    def create_data_rows_sync(self, items):
+        """ Synchronously bulk upload data rows.
+
+        Use this instead of `Dataset.create_data_rows` for smaller batches of data rows that need to be uploaded quickly.
+        Cannot use this for uploads containing more than 1000 data rows.
+        Each data row is also limited to 5 attachments.
+
+        Args:
+            items (iterable of (dict or str)):
+                See the docstring for `Dataset._create_descriptor_file` for more information.
+        Returns:
+            None. If the function doesn't raise an exception then the import was successful.
+
+        Raises:
+            InvalidQueryError: If the `items` parameter does not conform to
+                the specification in Dataset._create_descriptor_file or if the server did not accept the
+                DataRow creation request (unknown reason).
+            InvalidAttributeError: If there are fields in `items` not valid for
+                a DataRow.
+            ValueError: When the upload parameters are invalid
+        """
+        max_data_rows_supported = 1000
+        max_attachments_per_data_row = 5
+        if len(items) > max_data_rows_supported:
+            raise ValueError(
+                f"Dataset.create_data_rows_sync() supports a max of {max_data_rows_supported} data rows."
+                " For larger imports use the async function Dataset.create_data_rows()"
+            )
+        descriptor_url = self._create_descriptor_file(
+            items, max_attachments_per_data_row=max_attachments_per_data_row)
+        dataset_param = "datasetId"
+        url_param = "jsonUrl"
+        query_str = """mutation AppendRowsToDatasetSyncPyApi($%s: ID!, $%s: String!){
+            appendRowsToDatasetSync(data:{datasetId: $%s, jsonFileUrl: $%s}
+            ){dataset{id}}} """ % (dataset_param, url_param, dataset_param,
+                                   url_param)
+        self.client.execute(query_str, {
+            dataset_param: self.uid,
+            url_param: descriptor_url
+        })
+
     def create_data_rows(self, items):
-        """ Creates multiple DataRow objects based on the given `items`.
+        """ Asynchronously bulk upload data rows.
+
+        Use this instead of `Dataset.create_data_rows_sync` for batches that contain more than 1000 data rows.
+
+        Args:
+            items (iterable of (dict or str)): See the docstring for `Dataset._create_descriptor_file` for more information
+
+        Returns:
+            Task representing the data import on the server side. The Task
+            can be used for inspecting task progress and waiting until it's done.
+
+        Raises:
+            InvalidQueryError: If the `items` parameter does not conform to
+                the specification above or if the server did not accept the
+                DataRow creation request (unknown reason).
+            ResourceNotFoundError: If unable to retrieve the Task for the
+                import process. This could imply that the import failed.
+            InvalidAttributeError: If there are fields in `items` not valid for
+                a DataRow.
+            ValueError: When the upload parameters are invalid
+        """
+        descriptor_url = self._create_descriptor_file(items)
+        # Create data source
+        dataset_param = "datasetId"
+        url_param = "jsonUrl"
+        query_str = """mutation AppendRowsToDatasetPyApi($%s: ID!, $%s: String!){
+            appendRowsToDataset(data:{datasetId: $%s, jsonFileUrl: $%s}
+            ){ taskId accepted errorMessage } } """ % (dataset_param, url_param,
+                                                       dataset_param, url_param)
+
+        res = self.client.execute(query_str, {
+            dataset_param: self.uid,
+            url_param: descriptor_url
+        })
+        res = res["appendRowsToDataset"]
+        if not res["accepted"]:
+            msg = res['errorMessage']
+            raise InvalidQueryError(
+                f"Server did not accept DataRow creation request. {msg}")
+
+        # Fetch and return the task.
+        task_id = res["taskId"]
+        user = self.client.get_user()
+        task = list(user.created_tasks(where=Entity.Task.uid == task_id))
+        # Cache user in a private variable as the relationship can't be
+        # resolved due to server-side limitations (see Task.created_by)
+        # for more info.
+        if len(task) != 1:
+            raise ResourceNotFoundError(Entity.Task, task_id)
+        task = task[0]
+        task._user = user
+        return task
+
+    def _create_descriptor_file(self, items, max_attachments_per_data_row=None):
+        """
+        This function is shared by both `Dataset.create_data_rows` and `Dataset.create_data_rows_sync`
+        to prepare the input file. The user defined input is validated, processed, and json stringified.
+        Finally the json data is uploaded to gcs and a uri is returned. This uri can be passed to
+        the mutations that append rows to the dataset (see the two methods above).
+
 
         Each element in `items` can be either a `str` or a `dict`. If
         it is a `str`, then it is interpreted as a local file path. The file
@@ -102,19 +201,19 @@ def create_data_rows(self, items):
 
         Args:
             items (iterable of (dict or str)): See above for details.
+            max_attachments_per_data_row (Optional[int]): Param used during attachment validation to determine
+                if the user has provided too many attachments.
 
         Returns:
-            Task representing the data import on the server side. The Task
-            can be used for inspecting task progress and waiting until it's done.
+            uri (string): A reference to the uploaded json data.
 
         Raises:
             InvalidQueryError: If the `items` parameter does not conform to
                 the specification above or if the server did not accept the
                 DataRow creation request (unknown reason).
-            ResourceNotFoundError: If unable to retrieve the Task for the
-                import process. This could imply that the import failed.
             InvalidAttributeError: If there are fields in `items` not valid for
                 a DataRow.
+            ValueError: When the upload parameters are invalid
         """
         file_upload_thread_count = 20
         DataRow = Entity.DataRow
@@ -135,6 +234,12 @@ def validate_attachments(item):
             attachments = item.get('attachments')
             if attachments:
                 if isinstance(attachments, list):
+                    if max_attachments_per_data_row and len(
+                            attachments) > max_attachments_per_data_row:
+                        raise ValueError(
+                            f"Max number of supported attachments per data row is {max_attachments_per_data_row}."
+                            f" Found {len(attachments)}. Condense multiple attachments into one with the HTML attachment type if necessary."
+                        )
                     for attachment in attachments:
                         AssetAttachment.validate_attachment_json(attachment)
                 else:
@@ -198,40 +303,9 @@ def convert_item(item):
         with ThreadPoolExecutor(file_upload_thread_count) as executor:
             futures = [executor.submit(convert_item, item) for item in items]
             items = [future.result() for future in as_completed(futures)]
-
         # Prepare and upload the descriptor file
         data = json.dumps(items)
-        descriptor_url = self.client.upload_data(data)
-        # Create data source
-        dataset_param = "datasetId"
-        url_param = "jsonUrl"
-        query_str = """mutation AppendRowsToDatasetPyApi($%s: ID!, $%s: String!){
-            appendRowsToDataset(data:{datasetId: $%s, jsonFileUrl: $%s}
-            ){ taskId accepted errorMessage } } """ % (dataset_param, url_param,
-                                                       dataset_param, url_param)
-
-        res = self.client.execute(query_str, {
-            dataset_param: self.uid,
-            url_param: descriptor_url
-        })
-        res = res["appendRowsToDataset"]
-        if not res["accepted"]:
-            msg = res['errorMessage']
-            raise InvalidQueryError(
-                f"Server did not accept DataRow creation request. {msg}")
-
-        # Fetch and return the task.
-        task_id = res["taskId"]
-        user = self.client.get_user()
-        task = list(user.created_tasks(where=Entity.Task.uid == task_id))
-        # Cache user in a private variable as the relationship can't be
-        # resolved due to server-side limitations (see Task.created_by)
-        # for more info.
-        if len(task) != 1:
-            raise ResourceNotFoundError(Entity.Task, task_id)
-        task = task[0]
-        task._user = user
-        return task
+        return self.client.upload_data(data)
 
     def data_rows_for_external_id(self, external_id, limit=10):
         """ Convenience method for getting a single `DataRow` belonging to this

labelbox/schema/project.py

Lines changed: 3 additions & 3 deletions
@@ -166,7 +166,7 @@ def export_queued_data_rows(self, timeout_seconds=120):
                 self.uid)
             time.sleep(sleep_time)
 
-    def video_label_generator(self, timeout_seconds=120):
+    def video_label_generator(self, timeout_seconds=600):
         """
         Download video annotations
 
@@ -190,7 +190,7 @@ def video_label_generator(self, timeout_seconds=120):
                 "Or use project.label_generator() for text and imagery data.")
         return LBV1Converter.deserialize_video(json_data, self.client)
 
-    def label_generator(self, timeout_seconds=60):
+    def label_generator(self, timeout_seconds=600):
         """
         Download text and image annotations
 
@@ -214,7 +214,7 @@ def label_generator(self, timeout_seconds=60):
                 "Or use project.video_label_generator() for video data.")
         return LBV1Converter.deserialize(json_data)
 
-    def export_labels(self, download=False, timeout_seconds=60):
+    def export_labels(self, download=False, timeout_seconds=600):
         """ Calls the server-side Label exporting that generates a JSON
         payload, and returns the URL to that payload.

labelbox/schema/task.py

Lines changed: 1 addition & 1 deletion
@@ -40,7 +40,7 @@ def refresh(self):
         for field in self.fields():
             setattr(self, field.name, getattr(tasks[0], field.name))
 
-    def wait_till_done(self, timeout_seconds=60):
+    def wait_till_done(self, timeout_seconds=300):
         """ Waits until the task is completed. Periodically queries the server
         to update the task attributes.

tests/integration/test_data_rows.py

Lines changed: 33 additions & 3 deletions
@@ -66,16 +66,18 @@ def test_data_row_bulk_creation(dataset, rand_gen, image_url):
 @pytest.mark.slow
 def test_data_row_large_bulk_creation(dataset, image_url):
     # Do a longer task and expect it not to be complete immediately
+    n_local = 250
+    n_urls = 2000
     with NamedTemporaryFile() as fp:
         fp.write("Test data".encode())
         fp.flush()
         task = dataset.create_data_rows([{
             DataRow.row_data: image_url
-        }] * 750 + [fp.name] * 250)
+        }] * n_urls + [fp.name] * n_local)
     assert task.status == "IN_PROGRESS"
-    task.wait_till_done(timeout_seconds=120)
+    task.wait_till_done()
     assert task.status == "COMPLETE"
-    assert len(list(dataset.data_rows())) == 1000
+    assert len(list(dataset.data_rows())) == n_local + n_urls
 
 
 @pytest.mark.xfail(reason="DataRow.dataset() relationship not set")
@@ -210,3 +212,31 @@ def test_data_row_attachments(dataset, image_url):
             "value": "123"
         }]
     }])
+
+
+def test_create_data_rows_sync_attachments(dataset, image_url):
+    attachments = [("IMAGE", image_url), ("TEXT", "test-text"),
+                   ("IMAGE_OVERLAY", image_url), ("HTML", image_url)]
+    dataset.create_data_rows_sync([{
+        "row_data": image_url,
+        "external_id": "test-id",
+        "attachments": [{
+            "type": attachment_type,
+            "value": attachment_value
+        }]
+    } for attachment_type, attachment_value in attachments])
+    data_rows = list(dataset.data_rows())
+    assert len(data_rows) == len(attachments)
+    assert len(list(data_rows[0].attachments())) == 1
+
+
+def test_create_data_rows_sync_mixed_upload(dataset, image_url):
+    n_local = 100
+    n_urls = 100
+    with NamedTemporaryFile() as fp:
+        fp.write("Test data".encode())
+        fp.flush()
+        dataset.create_data_rows_sync([{
+            DataRow.row_data: image_url
+        }] * n_urls + [fp.name] * n_local)
+    assert len(list(dataset.data_rows())) == n_local + n_urls
