1515
class LabelCollection:
    """
    A container for interacting with a collection of labels.
    Less memory efficient than LabelGenerator but more performant and convenient to use.
    Use on smaller datasets.
    """

    def __init__(self, data: Iterable[Label]):
        # Backing store for the labels. NOTE(review): despite the Iterable
        # annotation, __len__/__getitem__/__next__ in this class index into
        # _data and call len() on it, so callers must pass an indexable
        # sequence (e.g. a list) — confirm and consider tightening the hint.
        self._data = data
        # Cursor used by the __iter__/__next__ protocol below.
        self._index = 0
26- def __iter__ (self ):
27- self ._index = 0
28- return self
29-
30- def __next__ (self ) -> Label :
31- if self ._index == len (self ._data ):
32- raise StopIteration
33-
34- value = self ._data [self ._index ]
35- self ._index += 1
36- return value
37-
38- def __len__ (self ) -> int :
39- return len (self ._data )
40-
41- def __getitem__ (self , idx : int ) -> Label :
42- return self ._data [idx ]
43-
4427 def assign_schema_ids (
4528 self , ontology_builder : OntologyBuilder ) -> "LabelCollection" :
4629 """
47- Based on an ontology:
48- - Checks to make sure that the feature names exist in the ontology
49- - Updates the names to match the ontology.
30+ Adds schema ids to all FeatureSchema objects in the Labels.
31+ This is necessary for MAL.
32+
33+ Args:
34+ ontology_builder: The ontology that matches the feature names assigned to objects in this LabelCollection
35+ Returns:
36+ LabelCollection. useful for chaining these modifying functions
5037 """
5138 for label in self ._data :
5239 label .assign_schema_ids (ontology_builder )
5340 return self
5441
    def _ensure_unique_external_ids(self) -> None:
        """
        Ensure every label's data object carries a unique external_id before a
        bulk upload; labels with no external_id get a fresh uuid4.

        Raises:
            ValueError: if two labels share the same non-None external_id.
        """
        external_ids = set()
        for label in self._data:
            if label.data.external_id is None:
                # NOTE(review): assigns a uuid.UUID object, not str — confirm
                # downstream consumers accept a UUID here.
                label.data.external_id = uuid4()
            else:
                if label.data.external_id in external_ids:
                    raise ValueError(
                        f"External ids must be unique for bulk uploading. Found {label.data.external_id} more than once."
                    )
                # NOTE(review): auto-generated ids above are never added to
                # this set, so they are not checked for collisions.
                external_ids.add(label.data.external_id)
6742 def add_to_dataset (self ,
68- dataset ,
69- signer ,
43+ dataset : "Entity.Dataset" ,
44+ signer : Callable [[ bytes ], str ] ,
7045 max_concurrency = 20 ) -> "LabelCollection" :
7146 """
72- # It is reccomended to create a new dataset if memory is a concern
73- # Also note that this relies on exported data that it cached.
74- # So this will not work on the same dataset more frequently than every 30 min.
75- # The workaround is creating a new dataset
47+ Creates data rows from each labels data object and attaches the data to the given dataset.
48+ Updates the label's data object to have the same external_id and uid as the data row.
49+ It is reccomended to create a new dataset if memory is a concern because all dataset data rows are exported to make this faster.
50+ Also note that this relies on exported data that it cached.
51+ So this will not work on the same dataset more frequently than every 30 min.
52+ The workaround is creating a new dataset each time this function is used.
53+
54+ Args:
55+ dataset: labelbox dataset object to add the new data row to
56+ signer: A function that accepts bytes and returns a signed url.
57+ Returns:
58+ LabelCollection with updated references to new data rows
7659 """
7760 self ._ensure_unique_external_ids ()
7861 self .add_url_to_data (signer , max_concurrency = max_concurrency )
@@ -92,8 +75,16 @@ def add_to_dataset(self,
9275
9376 def add_url_to_masks (self , signer , max_concurrency = 20 ) -> "LabelCollection" :
9477 """
95- Creates a data row id for each data row that needs it. If the data row exists then it skips the row.
96- TODO: Add error handling..
78+ Creates signed urls for all masks in the LabelCollection.
79+ Multiple masks can reference the same RasterData mask so this makes sure we only upload that url once.
80+ Only uploads url if one doesn't already exist.
81+
82+ Args:
83+ signer: A function that accepts bytes and returns a signed url.
84+ max_concurrency: how many threads to use for uploading.
85+ Should be balanced to match the signing services capabilities.
86+ Returns:
87+ LabelCollection with updated references to the new mask urls
9788 """
9889 for row in self ._apply_threaded (
9990 [label .add_url_to_masks for label in self ._data ], max_concurrency ,
@@ -103,14 +94,52 @@ def add_url_to_masks(self, signer, max_concurrency=20) -> "LabelCollection":
10394
10495 def add_url_to_data (self , signer , max_concurrency = 20 ) -> "LabelCollection" :
10596 """
106- TODO: Add error handling..
97+ Creates signed urls for the data
98+ Only uploads url if one doesn't already exist.
99+
100+ Args:
101+ signer: A function that accepts bytes and returns a signed url.
102+ max_concurrency: how many threads to use for uploading.
103+ Should be balanced to match the signing services capabilities.
104+ Returns:
105+ LabelCollection with updated references to the new data urls
107106 """
108107 for row in self ._apply_threaded (
109108 [label .add_url_to_data for label in self ._data ], max_concurrency ,
110109 signer ):
111110 ...
112111 return self
113112
113+ def _ensure_unique_external_ids (self ) -> None :
114+ external_ids = set ()
115+ for label in self ._data :
116+ if label .data .external_id is None :
117+ label .data .external_id = uuid4 ()
118+ else :
119+ if label .data .external_id in external_ids :
120+ raise ValueError (
121+ f"External ids must be unique for bulk uploading. Found { label .data .external_id } more than once."
122+ )
123+ external_ids .add (label .data .external_id )
124+
    def __iter__(self) -> "LabelCollection":
        # The collection is its own iterator; reset the shared cursor.
        # NOTE(review): nested or concurrent iteration shares this cursor
        # and will interfere with each other.
        self._index = 0
        return self

    def __next__(self) -> Label:
        # Stop once the cursor has walked past the last label.
        if self._index == len(self._data):
            raise StopIteration

        value = self._data[self._index]
        self._index += 1
        return value

    def __len__(self) -> int:
        # Number of labels in the collection.
        return len(self._data)

    def __getitem__(self, idx: int) -> Label:
        # Random access to a label by position, list-style.
        return self._data[idx]
142+
114143 def _apply_threaded (self , fns , max_concurrency , * args ):
115144 futures = []
116145 with ThreadPoolExecutor (max_workers = max_concurrency ) as executor :
@@ -122,6 +151,8 @@ def _apply_threaded(self, fns, max_concurrency, *args):
122151
class LabelGenerator(PrefetchGenerator):
    """
    A container for interacting with a collection of labels.

    Use this class if you have larger data. It is slightly harder to work with
    than the LabelCollection but will be much more memory efficient.
    """
@@ -130,14 +161,6 @@ def __init__(self, data: Generator[Label, None, None], *args, **kwargs):
130161 self ._fns = {}
131162 super ().__init__ (data , * args , ** kwargs )
132163
133- def __iter__ (self ):
134- return self
135-
136- def process (self , value ):
137- for fn in self ._fns .values ():
138- value = fn (value )
139- return value
140-
141164 def as_collection (self ) -> "LabelCollection" :
142165 return LabelCollection (data = list (self ))
143166
@@ -154,8 +177,13 @@ def _assign_ids(label: Label):
154177 def add_url_to_data (self , signer : Callable [[bytes ],
155178 str ]) -> "LabelGenerator" :
156179 """
157- Updates masks to have `url` attribute
158- Doesn't update masks that already have urls
180+ Creates signed urls for the data
181+ Only uploads url if one doesn't already exist.
182+
183+ Args:
184+ signer: A function that accepts bytes and returns a signed url.
185+ Returns:
186+ LabelGenerator that signs urls as data is accessed
159187 """
160188
161189 def _add_url_to_data (label : Label ):
@@ -165,8 +193,20 @@ def _add_url_to_data(label: Label):
165193 self ._fns ['_add_url_to_data' ] = _add_url_to_data
166194 return self
167195
168- def add_to_dataset (self , dataset ,
196+ def add_to_dataset (self , dataset : "Entity.Dataset" ,
169197 signer : Callable [[bytes ], str ]) -> "LabelGenerator" :
198+ """
199+ Creates data rows from each labels data object and attaches the data to the given dataset.
200+ Updates the label's data object to have the same external_id and uid as the data row.
201+
202+ This is a lot slower than LabelCollection.add_to_dataset but also more memory efficient.
203+
204+ Args:
205+ dataset: labelbox dataset object to add the new data row to
206+ signer: A function that accepts bytes and returns a signed url.
207+ Returns:
208+ LabelGenerator that updates references to the new data rows as data is accessed
209+ """
170210
171211 def _add_to_dataset (label : Label ):
172212 label .create_data_row (dataset , signer )
@@ -178,8 +218,16 @@ def _add_to_dataset(label: Label):
178218 def add_url_to_masks (self , signer : Callable [[bytes ],
179219 str ]) -> "LabelGenerator" :
180220 """
181- Updates masks to have `url` attribute
182- Doesn't update masks that already have urls
221+ Creates signed urls for all masks in the LabelGenerator.
222+ Multiple masks can reference the same RasterData mask so this makes sure we only upload that url once.
223+ Only uploads url if one doesn't already exist.
224+
225+ Args:
226+ signer: A function that accepts bytes and returns a signed url.
            (Note: unlike LabelCollection.add_url_to_masks, this method takes
            no max_concurrency argument — urls are signed lazily per label.)
229+ Returns:
230+ LabelGenerator that updates references to the new mask urls as data is accessed
183231 """
184232
185233 def _add_url_to_masks (label : Label ):
@@ -189,14 +237,22 @@ def _add_url_to_masks(label: Label):
189237 self ._fns ['add_url_to_masks' ] = _add_url_to_masks
190238 return self
191239
240+ def __iter__ (self ):
241+ return self
242+
243+ def _process (self , value ):
244+ for fn in self ._fns .values ():
245+ value = fn (value )
246+ return value
247+
192248 def __next__ (self ):
193249 """
194- - Double check that all values have been set.
195- - Items could have been processed before any of these modifying functions are called.
196- - None of these functions do anything if run more than once so the cost is minimal.
250+ Double checks that all values have been set.
251+ Items could have been processed before any of these modifying functions are called.
252+ None of these functions do anything if run more than once so the cost is minimal.
197253 """
198254 value = super ().__next__ ()
199- return self .process (value )
255+ return self ._process (value )
200256
201257
202258LabelData = Union [LabelCollection , LabelGenerator ]
0 commit comments