Labelbox
diff --git a/‎labelbox/data/annotation_types/classification/classification.py‎
Lines changed: 2 additions & 6 deletions b/‎labelbox/data/annotation_types/classification/classification.py‎
Lines changed: 2 additions & 6 deletions
diff --git a/‎labelbox/data/annotation_types/collection.py‎
Lines changed: 56 additions & 25 deletions b/‎labelbox/data/annotation_types/collection.py‎
Lines changed: 56 additions & 25 deletions
diff --git a/‎labelbox/data/annotation_types/data/raster.py‎
Lines changed: 1 addition & 0 deletions b/‎labelbox/data/annotation_types/data/raster.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎labelbox/data/annotation_types/data/text.py‎
Lines changed: 2 additions & 3 deletions b/‎labelbox/data/annotation_types/data/text.py‎
Lines changed: 2 additions & 3 deletions
diff --git a/‎labelbox/data/annotation_types/data/video.py‎
Lines changed: 5 additions & 6 deletions b/‎labelbox/data/annotation_types/data/video.py‎
Lines changed: 5 additions & 6 deletions
diff --git a/‎labelbox/data/annotation_types/geometry/rectangle.py‎
Lines changed: 2 additions & 3 deletions b/‎labelbox/data/annotation_types/geometry/rectangle.py‎
Lines changed: 2 additions & 3 deletions
diff --git a/‎labelbox/data/annotation_types/label.py‎
Lines changed: 15 additions & 13 deletions b/‎labelbox/data/annotation_types/label.py‎
Lines changed: 15 additions & 13 deletions
@@ -1,9 +1,7 @@
-from typing import Any, Dict, List, Union, ForwardRef
-from pydantic.class_validators import validator
-
-from pydantic.main import BaseModel
+from typing import Any, Dict, List
 
 from labelbox.data.annotation_types.reference import FeatureSchemaRef
+from pydantic.main import BaseModel
 
 
 class ClassificationAnswer(FeatureSchemaRef):
@@ -16,7 +14,6 @@ class Radio(BaseModel):
 
 class CheckList(BaseModel):
     answer: List[ClassificationAnswer]
-    # TODO: Validate that there is only one of each answer
 
 
 class Text(BaseModel):
@@ -25,4 +22,3 @@ class Text(BaseModel):
 
 class Dropdown(BaseModel):
     answer: List[ClassificationAnswer]
-    # TODO: Validate that there is only one of each answer
@@ -1,18 +1,23 @@
+import logging
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from typing import Callable, Generator, Iterable, Union
 from uuid import uuid4
 
 from labelbox.data.annotation_types.label import Label
+from labelbox.data.generator import PrefetchGenerator
 from labelbox.orm.model import Entity
 from labelbox.schema.ontology import OntologyBuilder
 from tqdm import tqdm
 
+logger = logging.getLogger(__name__)
+
 
 class LabelCollection:
     """
     A container for an iterable of `Label` objects.
 
     """
+
     def __init__(self, data: Iterable[Label]):
         self._data = data
         self._index = 0
@@ -35,7 +40,8 @@ def __len__(self) -> int:
     def __getitem__(self, idx: int) -> Label:
         return self._data[idx]
 
-    def assign_schema_ids(self, ontology_builder: OntologyBuilder) -> "LabelCollection":
+    def assign_schema_ids(
+            self, ontology_builder: OntologyBuilder) -> "LabelCollection":
         """
         Based on an ontology:
             - Checks to make sure that the feature names exist in the ontology
@@ -57,19 +63,22 @@ def _ensure_unique_external_ids(self) -> None:
                     )
             external_ids.add(label.data.external_id)
 
-    def add_to_dataset(self, dataset, signer, max_concurrency=20) -> "LabelCollection":
+    def add_to_dataset(self,
+                       dataset,
+                       signer,
+                       max_concurrency=20) -> "LabelCollection":
         """
         # It is recommended to create a new dataset if memory is a concern
         # Also note that this relies on exported data that is cached.
         # So this will not work on the same dataset more frequently than every 30 min.
         # The workaround is creating a new dataset
         """
         self._ensure_unique_external_ids()
-        self.add_urls_to_data(signer,
-                                           max_concurrency=max_concurrency)
-        upload_task = dataset.create_data_rows(
-            [{Entity.DataRow.row_data: label.data.url, Entity.DataRow.external_id: label.data.external_id} for label in self._data]
-        )
+        self.add_urls_to_data(signer, max_concurrency=max_concurrency)
+        upload_task = dataset.create_data_rows([{
+            Entity.DataRow.row_data: label.data.url,
+            Entity.DataRow.external_id: label.data.external_id
+        } for label in self._data])
         upload_task.wait_til_done()
 
         data_row_lookup = {
@@ -80,20 +89,26 @@ def add_to_dataset(self, dataset, signer, max_concurrency=20) -> "LabelCollectio
             label.data.uid = data_row_lookup[label.data.external_id]
         return self
 
-    def add_urls_to_masks(self, signer, max_concurrency=20) -> "LabelCollection":
+    def add_urls_to_masks(self,
+                          signer,
+                          max_concurrency=20) -> "LabelCollection":
         """
         Adds urls to the masks of every label by calling `add_url_to_masks` on each label with the signer.
         TODO: Add error handling..
         """
-        for row in self._apply_threaded([label.add_url_to_masks for label in self._data], max_concurrency, signer):
+        for row in self._apply_threaded(
+            [label.add_url_to_masks for label in self._data], max_concurrency,
+                signer):
             ...
         return self
 
     def add_urls_to_data(self, signer, max_concurrency=20) -> "LabelCollection":
         """
         TODO: Add error handling..
         """
-        for row in self._apply_threaded([label.add_url_to_data for label in self._data], max_concurrency, signer):
+        for row in self._apply_threaded(
+            [label.add_url_to_data for label in self._data], max_concurrency,
+                signer):
             ...
         return self
 
@@ -105,68 +120,84 @@ def _apply_threaded(self, fns, max_concurrency, *args):
             for future in tqdm(as_completed(futures)):
                 yield future.result()
 
-class LabelGenerator:
+
+class LabelGenerator(PrefetchGenerator):
     """
     Use this class if you have larger data. It is slightly harder to work with
     than the LabelCollection but will be much more memory efficient.
     """
-    def __init__(self, data: Generator[Label, None,None]):
-        if isinstance(data, (list, tuple)):
-            self._data = (r for r in data)
-        else:
-            self._data = data
+
+    def __init__(self, data: Generator[Label, None, None], *args, **kwargs):
         self._fns = {}
+        super().__init__(data, *args, **kwargs)
 
     def __iter__(self):
         return self
 
-    def __next__(self) -> Label:
-        # Maybe some sort of prefetching could be nice
-        # to make things faster if users are applying io functions
-        value = next(self._data)
+    def process(self, value):
         for fn in self._fns.values():
             value = fn(value)
         return value
 
     def as_collection(self) -> "LabelCollection":
-        return LabelCollection(data = list(self._data))
+        return LabelCollection(data=list(self))
+
+    def assign_schema_ids(
+            self, ontology_builder: OntologyBuilder) -> "LabelGenerator":
 
-    def assign_schema_ids(self, ontology_builder: OntologyBuilder) -> "LabelGenerator":
         def _assign_ids(label: Label):
             label.assign_schema_ids(ontology_builder)
             return label
+
         self._fns['assign_schema_ids'] = _assign_ids
         return self
 
-    def add_urls_to_data(self, signer: Callable[[bytes], str]) -> "LabelGenerator":
+    def add_urls_to_data(self, signer: Callable[[bytes],
+                                                str]) -> "LabelGenerator":
         """
         Updates each label's data to have a `url` attribute
         Doesn't update data that already has a url
         """
+
         def _add_urls_to_data(label: Label):
             label.add_url_to_data(signer)
             return label
+
         self._fns['_add_urls_to_data'] = _add_urls_to_data
         return self
 
-    def add_to_dataset(self, dataset, signer: Callable[[bytes], str]) -> "LabelGenerator":
+    def add_to_dataset(self, dataset,
+                       signer: Callable[[bytes], str]) -> "LabelGenerator":
+
         def _add_to_dataset(label: Label):
             label.create_data_row(dataset, signer)
             return label
+
         self._fns['assign_datarow_ids'] = _add_to_dataset
         return self
 
-    def add_urls_to_masks(self, signer: Callable[[bytes], str]) -> "LabelGenerator":
+    def add_urls_to_masks(self, signer: Callable[[bytes],
+                                                 str]) -> "LabelGenerator":
         """
         Updates masks to have `url` attribute
         Doesn't update masks that already have urls
         """
+
         def _add_urls_to_masks(label: Label):
             label.add_url_to_masks(signer)
             return label
+
         self._fns['add_urls_to_masks'] = _add_urls_to_masks
         return self
 
+    def __next__(self):
+        """
+        - Double check that all values have been set.
+        - Items could have been processed before any of these modifying functions are called.
+        - None of these functions do anything if run more than once so the cost is minimal.
+        """
+        value = super().__next__()
+        return self.process(value)
 
 
 LabelData = Union[LabelCollection, LabelGenerator]
@@ -49,6 +49,7 @@ def data(self) -> np.ndarray:
             raise ValueError("Must set either url, file_path or im_bytes")
 
     def create_url(self, signer: Callable[[bytes], str]) -> None:
+
         if self.url is not None:
             return self.url
         elif self.im_bytes is not None:
 
@@ -1,9 +1,8 @@
 from typing import Callable, Optional
 
 import requests
-from pydantic import ValidationError, root_validator
-
 from labelbox.data.annotation_types.reference import DataRowRef
+from pydantic import ValidationError, root_validator
 
 
 class TextData(DataRowRef):
@@ -49,7 +48,7 @@ def validate_date(cls, values):
         url = values.get("url")
         uid = values.get('uid')
         if uid == file_path == text == url == None:
-            raise ValidationError(
+            raise ValueError(
                 "One of `file_path`, `text`, `uid`, or `url` required.")
         return values
 
 
@@ -1,14 +1,13 @@
 import logging
-from uuid import uuid4
 import os
-from typing import Generator, Callable, Optional, Tuple, Dict, Any
+import urllib.request
+from typing import Any, Callable, Dict, Generator, Optional, Tuple
+from uuid import uuid4
 
 import cv2
-import urllib.request
 import numpy as np
-from pydantic import ValidationError, root_validator
-
 from labelbox.data.annotation_types.reference import DataRowRef
+from pydantic import ValidationError, root_validator
 
 logger = logging.getLogger(__name__)
 
@@ -104,7 +103,7 @@ def validate_data(cls, values):
         uid = values.get("uid")
 
         if uid == file_path == frames == url == None:
-            raise ValidationError(
+            raise ValueError(
                 "One of `file_path`, `frames`, `uid`, or `url` required.")
         return values
 
 
@@ -1,9 +1,8 @@
-from typing import Dict, Any
+from typing import Any, Dict
 
-import numpy as np
 import cv2
 import geojson
-
+import numpy as np
 from labelbox.data.annotation_types.geometry.geometry import Geometry
 from labelbox.data.annotation_types.geometry.point import Point
 
 
@@ -1,8 +1,9 @@
 from typing import Any, Dict, List, Union
 
-from labelbox.data.annotation_types.annotation import (
-    AnnotationType, ClassificationAnnotation, ObjectAnnotation,
-    VideoAnnotationType)
+from labelbox.data.annotation_types.annotation import (AnnotationType,
+                                                       ClassificationAnnotation,
+                                                       ObjectAnnotation,
+                                                       VideoAnnotationType)
 from labelbox.data.annotation_types.classification.classification import \
     ClassificationAnswer
 from labelbox.data.annotation_types.data.raster import RasterData
@@ -27,7 +28,7 @@ def add_url_to_data(self, signer):
         self.data.create_url(signer)
         return self
 
-    def add_url_to_masks(self, signer):
+    def add_url_to_masks(self, signer) -> "Label":
         masks = []
         for annotation in self.annotations:
             # Allows us to upload shared masks once
@@ -39,16 +40,17 @@ def add_url_to_masks(self, signer):
         return self
 
     def create_data_row(self, dataset, signer):
-        args = {
-            'row_data' : self.add_url_to_data(signer)
-        }
+        """
+        Only overwrites if necessary
+
+        """
+        args = {'row_data': self.add_url_to_data(signer)}
         if self.data.external_id is not None:
-            args.update({
-                'external'
-            })
-        data_row = dataset.create_data_row(**args)
-        self.data.uid = data_row.uid
-        self.data.external_id = data_row.external_id
+            args.update({'external'})
+        if self.data.uid is None:
+            data_row = dataset.create_data_row(**args)
+            self.data.uid = data_row.uid
+            self.data.external_id = data_row.external_id
         return self
 
     def get_feature_schema_lookup(self, ontology_builder):