Skip to content

Commit 6167d2a

Browse files
author
Matt Sokoloff
committed
add annotation types and tests
1 parent 4344d8e commit 6167d2a

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

57 files changed

+2252
-1
lines changed

Dockerfile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
FROM python:3.7

RUN pip install pytest pytest-cases
# Use apt-get (not apt) in scripts: apt's CLI is explicitly not stable for
# non-interactive use. Chaining update && install in one RUN keeps the package
# index and the install in the same layer, avoiding stale-cache failures.
# NOTE(review): libsm6/libxext6/libxrender1/libgl1-mesa-glx/ffmpeg look like
# OpenCV + video runtime deps — confirm against requirements.txt.
RUN apt-get update -y && apt-get install -y libsm6 libxext6 ffmpeg libfontconfig1 libxrender1 libgl1-mesa-glx

WORKDIR /usr/src/labelbox
COPY requirements.txt /usr/src/labelbox

labelbox/data/__init__.py

Whitespace-only changes.

labelbox/data/annotation_types/__init__.py

Whitespace-only changes.
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
from typing import Any, Dict, List, Union
2+
3+
from .classification import Checklist, Dropdown, Radio, Text
4+
from .feature import FeatureSchema
5+
from .geometry import Geometry
6+
from .ner import TextEntity
7+
8+
9+
class BaseAnnotation(FeatureSchema):
    """ Base annotation class. Shouldn't be directly instantiated
    """
    # Nested (sub)classifications attached to this annotation. Declared as a
    # forward reference; resolved by the update_forward_refs() calls later in
    # this module. Mutable default is safe here: pydantic copies defaults
    # per instance.
    classifications: List["ClassificationAnnotation"] = []
    # Free-form metadata carried alongside the annotation.
    extra: Dict[str, Any] = {}
14+
15+
16+
class ObjectAnnotation(BaseAnnotation):
    """Class representing objects annotations (non classifications or annotations that have a location)
    """
    # The localized value: a text entity (NER) or any geometric shape.
    value: Union[TextEntity, Geometry]
20+
21+
22+
class ClassificationAnnotation(BaseAnnotation):
    """Class representing classification annotations (annotations that don't have a location) """
    # The classification value; exactly one of the supported answer subtypes.
    value: Union[Text, Checklist, Radio, Dropdown]
25+
26+
27+
# Resolve the "ClassificationAnnotation" forward reference declared on
# BaseAnnotation.classifications, now that the class exists.
ClassificationAnnotation.update_forward_refs()
28+
29+
30+
class VideoObjectAnnotation(ObjectAnnotation):
    """
    Class for video objects annotations

    Args:
        frame: The frame index that this annotation corresponds to
        keyframe: Whether or not this annotation was a human generated or interpolated annotation
    """
    # Zero-or-one based frame index — TODO confirm convention with callers.
    frame: int
    keyframe: bool
40+
41+
42+
class VideoClassificationAnnotation(ClassificationAnnotation):
    """
    Class for video classification annotations

    Args:
        frame: The frame index that this annotation corresponds to
    """
    # Unlike VideoObjectAnnotation, no keyframe flag is tracked here.
    frame: int
50+
51+
52+
# Convenience unions for typing heterogeneous collections of annotations.
AnnotationType = Union[ClassificationAnnotation, ObjectAnnotation]
VideoAnnotationType = Union[VideoObjectAnnotation,
                            VideoClassificationAnnotation]

# Resolve the "ClassificationAnnotation" forward reference inherited from
# BaseAnnotation on the object annotation classes.
VideoObjectAnnotation.update_forward_refs()
ObjectAnnotation.update_forward_refs()
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
from .classification import (Checklist, ClassificationAnswer, Dropdown, Radio,
2+
Text)
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
from typing import Any, Dict, List
2+
3+
from pydantic.main import BaseModel
4+
5+
from ..feature import FeatureSchema
6+
7+
8+
class ClassificationAnswer(FeatureSchema):
    """
    - Represents a classification option.
    - Because it inherits from FeatureSchema
      the option can be represented with either the name or schema_id
    """
    # Free-form metadata carried alongside this answer.
    extra: Dict[str, Any] = {}
15+
16+
17+
class Radio(BaseModel):
    """ A classification with only one selected option allowed """
    # The single selected option.
    answer: ClassificationAnswer
20+
21+
22+
class Checklist(BaseModel):
    """ A classification with many selected options allowed """
    # All selected options (may be empty).
    answer: List[ClassificationAnswer]
25+
26+
27+
class Text(BaseModel):
    """ Free form text """
    answer: str
30+
31+
32+
class Dropdown(BaseModel):
    """
    - A classification with many selected options allowed.
    - This is not currently compatible with MAL.
    """
    # All selected options, same shape as Checklist.
    answer: List[ClassificationAnswer]
Lines changed: 258 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,258 @@
1+
import logging
2+
from concurrent.futures import ThreadPoolExecutor, as_completed
3+
from typing import Callable, Generator, Iterable, Union
4+
from uuid import uuid4
5+
6+
from tqdm import tqdm
7+
8+
from labelbox.schema.ontology import OntologyBuilder
9+
from labelbox.orm.model import Entity
10+
from ..generator import PrefetchGenerator
11+
from .label import Label
12+
13+
# Module-level logger, standard library convention.
logger = logging.getLogger(__name__)
14+
15+
16+
class LabelCollection:
    """
    A container for interacting with a collection of labels.
    Less memory efficient than LabelGenerator but more performant and convenient to use.
    Use on smaller datasets.
    """

    def __init__(self, data: Iterable[Label]):
        # NOTE(review): __len__/__getitem__/__next__ below index into the
        # data, so despite the Iterable annotation a sequence is expected —
        # confirm callers never pass a one-shot generator.
        self._data = data
        self._index = 0

    def assign_schema_ids(
            self, ontology_builder: OntologyBuilder) -> "LabelCollection":
        """
        Adds schema ids to all FeatureSchema objects in the Labels.
        This is necessary for MAL.

        Args:
            ontology_builder: The ontology that matches the feature names assigned to objects in this LabelCollection
        Returns:
            LabelCollection. useful for chaining these modifying functions
        """
        for label in self._data:
            label.assign_schema_ids(ontology_builder)
        return self

    def add_to_dataset(self,
                       dataset: "Entity.Dataset",
                       signer: Callable[[bytes], str],
                       max_concurrency=20) -> "LabelCollection":
        """
        Creates data rows from each labels data object and attaches the data to the given dataset.
        Updates the label's data object to have the same external_id and uid as the data row.
        It is recommended to create a new dataset if memory is a concern because all dataset data rows are exported to make this faster.
        Also note that this relies on exported data that is cached.
        So this will not work on the same dataset more frequently than every 30 min.
        The workaround is creating a new dataset each time this function is used.

        Args:
            dataset: labelbox dataset object to add the new data row to
            signer: A function that accepts bytes and returns a signed url.
            max_concurrency: number of threads used when signing data urls.
        Returns:
            LabelCollection with updated references to new data rows
        """
        self._ensure_unique_external_ids()
        self.add_url_to_data(signer, max_concurrency=max_concurrency)
        upload_task = dataset.create_data_rows([{
            Entity.DataRow.row_data: label.data.url,
            Entity.DataRow.external_id: label.data.external_id
        } for label in self._data])
        # Fixed from `wait_til_done` (AttributeError at runtime): the SDK
        # Task method is spelled `wait_till_done`.
        upload_task.wait_till_done()

        # Map the freshly created data rows back onto the labels via the
        # external ids we just guaranteed to be unique.
        data_row_lookup = {
            data_row.external_id: data_row.uid
            for data_row in dataset.export_data_rows()
        }
        for label in self._data:
            label.data.uid = data_row_lookup[label.data.external_id]
        return self

    def add_url_to_masks(self, signer, max_concurrency=20) -> "LabelCollection":
        """
        Creates signed urls for all masks in the LabelCollection.
        Multiple masks can reference the same RasterData mask so this makes sure we only upload that url once.
        Only uploads url if one doesn't already exist.

        Args:
            signer: A function that accepts bytes and returns a signed url.
            max_concurrency: how many threads to use for uploading.
                Should be balanced to match the signing services capabilities.
        Returns:
            LabelCollection with updated references to the new mask urls
        """
        # Drain the generator purely for its side effects on the labels.
        for _ in self._apply_threaded(
            [label.add_url_to_masks for label in self._data], max_concurrency,
                signer):
            pass
        return self

    def add_url_to_data(self, signer, max_concurrency=20) -> "LabelCollection":
        """
        Creates signed urls for the data.
        Only uploads url if one doesn't already exist.

        Args:
            signer: A function that accepts bytes and returns a signed url.
            max_concurrency: how many threads to use for uploading.
                Should be balanced to match the signing services capabilities.
        Returns:
            LabelCollection with updated references to the new data urls
        """
        for _ in self._apply_threaded(
            [label.add_url_to_data for label in self._data], max_concurrency,
                signer):
            pass
        return self

    def _ensure_unique_external_ids(self) -> None:
        """Fills in missing external ids and raises on duplicates.

        Raises:
            ValueError: if two labels share the same external_id.
        """
        external_ids = set()
        for label in self._data:
            if label.data.external_id is None:
                # str(...) fix: uuid4() returns a UUID object, which would
                # never match the string external ids returned by
                # dataset.export_data_rows() in add_to_dataset's lookup.
                label.data.external_id = str(uuid4())
            else:
                if label.data.external_id in external_ids:
                    raise ValueError(
                        f"External ids must be unique for bulk uploading. Found {label.data.external_id} more than once."
                    )
            external_ids.add(label.data.external_id)

    def __iter__(self) -> "LabelCollection":
        self._index = 0
        return self

    def __next__(self) -> Label:
        if self._index == len(self._data):
            raise StopIteration

        value = self._data[self._index]
        self._index += 1
        return value

    def __len__(self) -> int:
        return len(self._data)

    def __getitem__(self, idx: int) -> Label:
        return self._data[idx]

    def _apply_threaded(self, fns, max_concurrency, *args):
        """Runs each fn(*args) on a thread pool, yielding results as they complete."""
        futures = []
        with ThreadPoolExecutor(max_workers=max_concurrency) as executor:
            for fn in fns:
                futures.append(executor.submit(fn, *args))
            # tqdm gives upload progress; completion order is arbitrary.
            for future in tqdm(as_completed(futures)):
                yield future.result()
150+
151+
152+
class LabelGenerator(PrefetchGenerator):
    """
    A container for interacting with a collection of labels.

    Use this class if you have larger data. It is slightly harder to work with
    than the LabelCollection but will be much more memory efficient.
    """

    def __init__(self, data: Generator[Label, None, None], *args, **kwargs):
        # Registry of pending transformations, applied lazily (and in
        # registration order) to each label as it is yielded — see _process.
        self._fns = {}
        super().__init__(data, *args, **kwargs)

    def as_collection(self) -> "LabelCollection":
        """Materializes the generator into an in-memory LabelCollection."""
        return LabelCollection(data=list(self))

    def assign_schema_ids(
            self, ontology_builder: OntologyBuilder) -> "LabelGenerator":
        """
        Adds schema ids to all FeatureSchema objects in the Labels as they
        are yielded. This is necessary for MAL.

        Args:
            ontology_builder: The ontology that matches the feature names assigned to objects in this LabelGenerator
        Returns:
            LabelGenerator that assigns schema ids as data is accessed
        """

        def _assign_ids(label: Label):
            label.assign_schema_ids(ontology_builder)
            return label

        self._fns['assign_schema_ids'] = _assign_ids
        return self

    def add_url_to_data(self, signer: Callable[[bytes],
                                               str]) -> "LabelGenerator":
        """
        Creates signed urls for the data.
        Only uploads url if one doesn't already exist.

        Args:
            signer: A function that accepts bytes and returns a signed url.
        Returns:
            LabelGenerator that signs urls as data is accessed
        """

        def _add_url_to_data(label: Label):
            label.add_url_to_data(signer)
            return label

        # Key renamed from '_add_url_to_data' for consistency with the other
        # registry keys; keys are internal and never read by name elsewhere.
        self._fns['add_url_to_data'] = _add_url_to_data
        return self

    def add_to_dataset(self, dataset: "Entity.Dataset",
                       signer: Callable[[bytes], str]) -> "LabelGenerator":
        """
        Creates data rows from each labels data object and attaches the data to the given dataset.
        Updates the label's data object to have the same external_id and uid as the data row.

        This is a lot slower than LabelCollection.add_to_dataset but also more memory efficient.

        Args:
            dataset: labelbox dataset object to add the new data row to
            signer: A function that accepts bytes and returns a signed url.
        Returns:
            LabelGenerator that updates references to the new data rows as data is accessed
        """

        def _add_to_dataset(label: Label):
            label.create_data_row(dataset, signer)
            return label

        self._fns['assign_datarow_ids'] = _add_to_dataset
        return self

    def add_url_to_masks(self, signer: Callable[[bytes],
                                                str]) -> "LabelGenerator":
        """
        Creates signed urls for all masks in the LabelGenerator.
        Multiple masks can reference the same RasterData mask so this makes sure we only upload that url once.
        Only uploads url if one doesn't already exist.

        Args:
            signer: A function that accepts bytes and returns a signed url.
        Returns:
            LabelGenerator that updates references to the new mask urls as data is accessed
        """

        def _add_url_to_masks(label: Label):
            label.add_url_to_masks(signer)
            return label

        self._fns['add_url_to_masks'] = _add_url_to_masks
        return self

    def __iter__(self):
        return self

    def _process(self, value):
        """Applies every registered transformation to a single label."""
        for fn in self._fns.values():
            value = fn(value)
        return value

    def __next__(self):
        """
        Double checks that all values have been set.
        Items could have been processed before any of these modifying functions are called.
        None of these functions do anything if run more than once so the cost is minimal.
        """
        value = super().__next__()
        return self._process(value)
256+
257+
258+
# Union alias for APIs that accept either label container.
LabelData = Union[LabelCollection, LabelGenerator]
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
from .raster import RasterData
2+
from .text import TextData
3+
from .video import VideoData
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
from typing import Optional
2+
3+
from pydantic import BaseModel
4+
5+
6+
class BaseData(BaseModel):
    """
    Base class for objects representing data.
    This class shouldn't directly be used
    """
    # User-supplied identifier for the underlying asset (e.g. a filename);
    # may be absent until assigned during upload.
    external_id: Optional[str] = None
    # Unique id of the corresponding data row; set once the data row exists.
    uid: Optional[str] = None

0 commit comments

Comments
 (0)