1 | 1 | from concurrent.futures import ThreadPoolExecutor, as_completed |
2 | | -from typing import Iterable, List, Any |
| 2 | +from typing import Callable, Generator, Iterable, Union |
3 | 3 | from uuid import uuid4 |
4 | 4 |
5 | | -from pydantic import BaseModel |
6 | | - |
7 | 5 | from labelbox.data.annotation_types.label import Label |
8 | 6 | from labelbox.orm.model import Entity |
| 7 | +from labelbox.schema.ontology import OntologyBuilder |
| 8 | +from tqdm import tqdm |
| 9 | + |
| 10 | + |
| 11 | +class LabelCollection: |
| 12 | + """ |
| 13 | + A container for in-memory labels. |
| 14 | + Supports indexing, iteration, and bulk operations such as uploading to a dataset. |
| 15 | + """ |
| 16 | + def __init__(self, data: Iterable[Label]): |
| 17 | + self._data = data |
| 18 | + self._index = 0 |
| 19 | + |
| 20 | + def __iter__(self): |
| 21 | + self._index = 0 |
| 22 | + return self |
| 23 | + |
| 24 | + def __next__(self) -> Label: |
| 25 | + if self._index == len(self._data): |
| 26 | + raise StopIteration |
| 27 | + |
| 28 | + value = self._data[self._index] |
| 29 | + self._index += 1 |
| 30 | + return value |
9 | 31 |
| 32 | + def __len__(self) -> int: |
| 33 | + return len(self._data) |
10 | 34 |
11 | | -class LabelCollection(BaseModel): |
12 | | - data: Iterable[Label] |
| 35 | + def __getitem__(self, idx: int) -> Label: |
| 36 | + return self._data[idx] |
13 | 37 |
14 | | - def assign_schema_ids(self, ontology_builder): |
| 38 | + def assign_schema_ids(self, ontology_builder: OntologyBuilder) -> "LabelCollection": |
15 | 39 | """ |
16 | 40 | Based on an ontology: |
17 | 41 | - Checks that the feature names exist in the ontology |
18 | 42 | - Assigns the matching schema ids to each annotation. |
19 | 43 | """ |
20 | | - for label in self.data: |
21 | | - for annotation in label.annotations: |
22 | | - annotation.assign_schema_ids(ontology_builder) |
| 44 | + for label in self._data: |
| 45 | + label.assign_schema_ids(ontology_builder) |
| 46 | + return self |
23 | 47 |
24 | | - def create_dataset(self, client, dataset_name, signer, max_concurrency=20): |
| 48 | + def _ensure_unique_external_ids(self) -> None: |
25 | 49 | external_ids = set() |
26 | | - for label in self.data: |
| 50 | + for label in self._data: |
27 | 51 | if label.data.external_id is None: |
28 | 52 | label.data.external_id = str(uuid4()) |
29 | 53 | else: |
30 | 54 | if label.data.external_id in external_ids: |
31 | 55 | raise ValueError( |
32 | | - f"External ids must be unique for bulk uploading. Found {label.data.exeternal_id} more than once." |
| 56 | + f"External ids must be unique for bulk uploading. Found {label.data.external_id} more than once." |
33 | 57 | ) |
34 | 58 | external_ids.add(label.data.external_id) |
35 | | - labels = self.create_urls_for_data(signer, |
| 59 | + |
| 60 | + def add_to_dataset(self, dataset, signer, max_concurrency=20) -> "LabelCollection": |
| 61 | + """ |
| 62 | + It is recommended to create a new dataset if memory is a concern. |
| 63 | + Also note that this relies on exported data that is cached, |
| 64 | + so it will not work on the same dataset more frequently than every 30 min. |
| 65 | + The workaround is to create a new dataset. |
| 66 | + """ |
| 67 | + self._ensure_unique_external_ids() |
| 68 | + self.add_urls_to_data(signer, |
36 | 69 | max_concurrency=max_concurrency) |
37 | | - dataset = client.create_dataset(name=dataset_name) |
38 | | - upload_task = dataset.create_data_row( |
39 | | - {Entity.DataRow.row_data: label.data.url for label in labels}) |
| 70 | + upload_task = dataset.create_data_rows( |
| 71 | + [{Entity.DataRow.row_data: label.data.url, Entity.DataRow.external_id: label.data.external_id} for label in self._data] |
| 72 | + ) |
40 | 73 | upload_task.wait_till_done() |
41 | 74 |
42 | | - data_rows = { |
| 75 | + data_row_lookup = { |
43 | 76 | data_row.external_id: data_row.uid |
44 | 77 | for data_row in dataset.export_data_rows() |
45 | 78 | } |
46 | | - for label in self.data: |
47 | | - data_row = data_rows[label.data.external_id] |
48 | | - label.data.uid = data_row.uid |
| 79 | + for label in self._data: |
| 80 | + label.data.uid = data_row_lookup[label.data.external_id] |
| 81 | + return self |
49 | 82 |
50 | | - def create_urls_for_masks(self, signer, max_concurrency=20): |
| 83 | + def add_urls_to_masks(self, signer, max_concurrency=20) -> "LabelCollection": |
51 | 84 | """ |
52 | 85 | Creates a signed url for each mask. Masks that already have a url are skipped. |
53 | 86 | TODO: Add error handling.. |
54 | 87 | """ |
55 | | - futures = {} |
56 | | - with ThreadPoolExecutor(max_workers=max_concurrency) as executor: |
57 | | - for label in self.data: |
58 | | - futures[executor.submit(label.create_url_for_masks)] = label |
59 | | - for future in as_completed(futures): |
60 | | - # Yields the label. But this function modifies the objects to have updated urls. |
61 | | - yield futures[future] |
62 | | - del futures[future] |
63 | | - |
64 | | - def create_urls_for_data(self, signer, max_concurrency=20): |
| 88 | + for row in self._apply_threaded([label.add_url_to_masks for label in self._data], max_concurrency, signer): |
| 89 | + ... |
| 90 | + return self |
| 91 | + |
| 92 | + def add_urls_to_data(self, signer, max_concurrency=20) -> "LabelCollection": |
65 | 93 | """ |
66 | 94 | TODO: Add error handling.. |
67 | 95 | """ |
68 | | - futures = {} |
| 96 | + for row in self._apply_threaded([label.add_url_to_data for label in self._data], max_concurrency, signer): |
| 97 | + ... |
| 98 | + return self |
| 99 | + |
| 100 | + def _apply_threaded(self, fns, max_concurrency, *args): |
| 101 | + futures = [] |
69 | 102 | with ThreadPoolExecutor(max_workers=max_concurrency) as executor: |
70 | | - for label in self.data: |
71 | | - futures[executor.submit(label.create_url_for_data)] = label |
72 | | - for future in as_completed(futures): |
73 | | - yield futures[future] |
74 | | - del futures[future] |
| 103 | + for fn in fns: |
| 104 | + futures.append(executor.submit(fn, *args)) |
| 105 | + for future in tqdm(as_completed(futures)): |
| 106 | + yield future.result() |
| 107 | + |
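A rough usage sketch of the eager `LabelCollection` path, to show how the pieces above fit together. `my_labels`, the API key/project id, and the `Client` / `OntologyBuilder.from_project` / `upload_data` entry points are assumptions for illustration, not part of this diff:

```python
from labelbox import Client
from labelbox.schema.ontology import OntologyBuilder

client = Client(api_key="...")                 # assumed credentials
project = client.get_project("<project_id>")   # project that owns the ontology

def signer(content: bytes) -> str:
    # Assumption: any callable that stores the bytes somewhere Labelbox can
    # reach and returns a URL will do; client.upload_data is one option.
    return client.upload_data(content)

ontology_builder = OntologyBuilder.from_project(project)
dataset = client.create_dataset(name="bulk-upload")   # fresh dataset, per the docstring caveat

labels = LabelCollection(data=my_labels)       # my_labels: a list of Label objects
labels.assign_schema_ids(ontology_builder) \
      .add_urls_to_masks(signer) \
      .add_to_dataset(dataset, signer)
```

A fresh dataset is used here on purpose, matching the note in `add_to_dataset` about cached exports.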
| 108 | +class LabelGenerator: |
| 109 | + """ |
| 110 | + Use this class for large datasets. It is slightly harder to work with |
| 111 | + than LabelCollection but is much more memory efficient. |
| 112 | + """ |
| 113 | + def __init__(self, data: Generator[Label, None, None]): |
| 114 | + if isinstance(data, (list, tuple)): |
| 115 | + self._data = (r for r in data) |
| 116 | + else: |
| 117 | + self._data = data |
| 118 | + self._fns = {} |
| 119 | + |
| 120 | + def __iter__(self): |
| 121 | + return self |
| 122 | + |
| 123 | + def __next__(self) -> Label: |
| 124 | + # Some sort of prefetching could make this faster |
| 125 | + # if users are applying IO-bound functions |
| 126 | + value = next(self._data) |
| 127 | + for fn in self._fns.values(): |
| 128 | + value = fn(value) |
| 129 | + return value |
| 130 | + |
| 131 | + def as_collection(self) -> "LabelCollection": |
| 132 | + return LabelCollection(data=list(self._data)) |
| 133 | + |
| 134 | + def assign_schema_ids(self, ontology_builder: OntologyBuilder) -> "LabelGenerator": |
| 135 | + def _assign_ids(label: Label): |
| 136 | + label.assign_schema_ids(ontology_builder) |
| 137 | + return label |
| 138 | + self._fns['assign_schema_ids'] = _assign_ids |
| 139 | + return self |
| 140 | + |
| 141 | + def add_urls_to_data(self, signer: Callable[[bytes], str]) -> "LabelGenerator": |
| 142 | + """ |
| 143 | + Updates data to have a `url` attribute |
| 144 | + Doesn't update data that already has a url |
| 145 | + """ |
| 146 | + def _add_urls_to_data(label: Label): |
| 147 | + label.add_url_to_data(signer) |
| 148 | + return label |
| 149 | + self._fns['add_urls_to_data'] = _add_urls_to_data |
| 150 | + return self |
| 151 | + |
| 152 | + def add_to_dataset(self, dataset, signer: Callable[[bytes], str]) -> "LabelGenerator": |
| 153 | + def _add_to_dataset(label: Label): |
| 154 | + label.create_data_row(dataset, signer) |
| 155 | + return label |
| 156 | + self._fns['assign_datarow_ids'] = _add_to_dataset |
| 157 | + return self |
| 158 | + |
| 159 | + def add_urls_to_masks(self, signer: Callable[[bytes], str]) -> "LabelGenerator": |
| 160 | + """ |
| 161 | + Updates masks to have a `url` attribute |
| 162 | + Doesn't update masks that already have urls |
| 163 | + """ |
| 164 | + def _add_urls_to_masks(label: Label): |
| 165 | + label.add_url_to_masks(signer) |
| 166 | + return label |
| 167 | + self._fns['add_urls_to_masks'] = _add_urls_to_masks |
| 168 | + return self |
| 169 | + |
| 170 | + |
| 171 | + |
| 172 | +LabelData = Union[LabelCollection, LabelGenerator] |
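A minimal sketch of the lazy `LabelGenerator` path, reusing `client`, `signer`, and `ontology_builder` from the sketch above; `parse_labels` is a hypothetical loader. The queued functions only run as each label is consumed:

```python
def parse_labels():
    # Hypothetical loader: yield Label objects one at a time, e.g. parsed from a
    # large export file, so the whole dataset never sits in memory at once.
    yield from []  # a real loader would yield Label instances here

dataset = client.create_dataset(name="bulk-upload-generator")  # fresh dataset again

labels: LabelData = (LabelGenerator(data=parse_labels())
                     .assign_schema_ids(ontology_builder)
                     .add_urls_to_masks(signer)
                     .add_to_dataset(dataset, signer))

# Nothing has been uploaded yet; each queued function is applied per label
# as the generator is iterated.
for label in labels:
    pass  # label now has schema ids, mask urls, and a backing data row
```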