@@ -411,9 +411,6 @@ def export_v2(self,
                   task_name: Optional[str] = None,
                   params: Optional[ProjectExportParams] = None) -> Task:

-        if (task_name is None):
-            task_name = f'Export Data Rows in Project - {self.name}'
-
         _params = params or ProjectExportParams({
             "attachments": False,
             "metadata_fields": False,
@@ -669,16 +666,20 @@ def setup(self, labeling_frontend, labeling_frontend_options) -> None:
         timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
         self.update(setup_complete=timestamp)

-    def create_batch(self,
-                     name: str,
-                     data_rows: List[Union[str, DataRow]],
-                     priority: int = 5,
-                     consensus_settings: Optional[Dict[str, float]] = None):
-        """Create a new batch for a project. Batches is in Beta and subject to change
+    def create_batch(
+        self,
+        name: str,
+        data_rows: Optional[List[Union[str, DataRow]]] = None,
+        priority: int = 5,
+        consensus_settings: Optional[Dict[str, float]] = None,
+        global_keys: Optional[List[str]] = None,
+    ):
+        """Create a new batch for a project. One of `global_keys` or `data_rows` must be provided, but not both.

         Args:
             name: a name for the batch, must be unique within a project
-            data_rows: Either a list of `DataRows` or Data Row ids
+            data_rows: Either a list of `DataRows` or Data Row ids.
+            global_keys: global keys for data rows to add to the batch.
             priority: An optional priority for the Data Rows in the Batch. 1 highest -> 5 lowest
             consensus_settings: An optional dictionary with consensus settings: {'number_of_labels': 3, 'coverage_percentage': 0.1}
         """
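For context, a minimal usage sketch of the new signature; the client setup, project ID, and key values below are placeholders for illustration, not part of this change:

import labelbox as lb

client = lb.Client(api_key="<API_KEY>")
project = client.get_project("<PROJECT_ID>")

# Address rows by Data Row id, as before...
batch = project.create_batch(name="batch-by-ids",
                             data_rows=["<DATA_ROW_ID>"],
                             priority=3)

# ...or, with this change, by global key (one of the two, never both).
batch = project.create_batch(name="batch-by-keys",
                             global_keys=["<GLOBAL_KEY>"])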
@@ -688,35 +689,45 @@ def create_batch(self,
             raise ValueError("Project must be in batch mode")

         dr_ids = []
-        for dr in data_rows:
-            if isinstance(dr, Entity.DataRow):
-                dr_ids.append(dr.uid)
-            elif isinstance(dr, str):
-                dr_ids.append(dr)
-            else:
-                raise ValueError("You can DataRow ids or DataRow objects")
+        if data_rows is not None:
+            for dr in data_rows:
+                if isinstance(dr, Entity.DataRow):
+                    dr_ids.append(dr.uid)
+                elif isinstance(dr, str):
+                    dr_ids.append(dr)
+                else:
+                    raise ValueError(
+                        "`data_rows` must be DataRow ids or DataRow objects")
+
+        if data_rows is not None:
+            row_count = len(data_rows)
+        elif global_keys is not None:
+            row_count = len(global_keys)
+        else:
+            row_count = 0

-        if len(dr_ids) > 100_000:
+        if row_count > 100_000:
             raise ValueError(
                 f"Batch exceeds max size, break into smaller batches")
-        if not len(dr_ids):
+        if not row_count:
             raise ValueError("You need at least one data row in a batch")

         self._wait_until_data_rows_are_processed(
-            dr_ids, self._wait_processing_max_seconds)
+            dr_ids, global_keys, self._wait_processing_max_seconds)

         if consensus_settings:
             consensus_settings = ConsensusSettings(**consensus_settings).dict(
                 by_alias=True)

         if len(dr_ids) >= 10_000:
-            return self._create_batch_async(name, dr_ids, priority,
+            return self._create_batch_async(name, dr_ids, global_keys, priority,
                                             consensus_settings)
         else:
-            return self._create_batch_sync(name, dr_ids, priority,
+            return self._create_batch_sync(name, dr_ids, global_keys, priority,
                                            consensus_settings)

-    def _create_batch_sync(self, name, dr_ids, priority, consensus_settings):
+    def _create_batch_sync(self, name, dr_ids, global_keys, priority,
+                           consensus_settings):
         method = 'createBatchV2'
         query_str = """mutation %sPyApi($projectId: ID!, $batchInput: CreateBatchInput!) {
             project(where: {id: $projectId}) {
@@ -734,6 +745,7 @@ def _create_batch_sync(self, name, dr_ids, priority, consensus_settings):
             "batchInput": {
                 "name": name,
                 "dataRowIds": dr_ids,
+                "globalKeys": global_keys,
                 "priority": priority,
                 "consensusSettings": consensus_settings
             }
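To illustrate the payload above: when a batch is created from global keys, `dr_ids` stays empty and `globalKeys` carries the identifiers, so the mutation variables would look roughly like this (the values are hypothetical):

params = {
    "projectId": "<PROJECT_ID>",
    "batchInput": {
        "name": "batch-by-keys",
        "dataRowIds": [],  # empty when rows are addressed by global key
        "globalKeys": ["<GLOBAL_KEY>"],
        "priority": 5,
        "consensusSettings": None,
    }
}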
@@ -751,7 +763,8 @@ def _create_batch_sync(self, name, dr_ids, priority, consensus_settings):

     def _create_batch_async(self,
                             name: str,
-                            dr_ids: List[str],
+                            dr_ids: Optional[List[str]] = None,
+                            global_keys: Optional[List[str]] = None,
                             priority: int = 5,
                             consensus_settings: Optional[Dict[str,
                                                               float]] = None):
@@ -794,6 +807,7 @@ def _create_batch_async(self,
             "input": {
                 "batchId": batch_id,
                 "dataRowIds": dr_ids,
+                "globalKeys": global_keys,
                 "priority": priority,
             }
         }
@@ -1260,38 +1274,50 @@ def _is_url_valid(url: Union[str, Path]) -> bool:
             raise ValueError(
                 f'Invalid annotations given of type: {type(annotations)}')

-    def _wait_until_data_rows_are_processed(self,
-                                            data_row_ids: List[str],
-                                            wait_processing_max_seconds: int,
-                                            sleep_interval=30):
+    def _wait_until_data_rows_are_processed(
+            self,
+            data_row_ids: Optional[List[str]] = None,
+            global_keys: Optional[List[str]] = None,
+            wait_processing_max_seconds: int = _wait_processing_max_seconds,
+            sleep_interval=30):
         """ Wait until all the specified data rows are processed"""
         start_time = datetime.now()
+
         while True:
             if (datetime.now() -
                     start_time).total_seconds() >= wait_processing_max_seconds:
                 raise ProcessingWaitTimeout(
                     "Maximum wait time exceeded while waiting for data rows to be processed. Try creating a batch a bit later"
                 )

-            all_good = self.__check_data_rows_have_been_processed(data_row_ids)
+            all_good = self.__check_data_rows_have_been_processed(
+                data_row_ids, global_keys)
             if all_good:
                 return

             logger.debug(
                 'Some of the data rows are still being processed, waiting...')
             time.sleep(sleep_interval)

-    def __check_data_rows_have_been_processed(self, data_row_ids: List[str]):
-        data_row_ids_param = "data_row_ids"
+    def __check_data_rows_have_been_processed(
+            self,
+            data_row_ids: Optional[List[str]] = None,
+            global_keys: Optional[List[str]] = None):
+
+        if data_row_ids is not None and len(data_row_ids) > 0:
+            param_name = "dataRowIds"
+            params = {param_name: data_row_ids}
+        else:
+            param_name = "globalKeys"
+            global_keys = global_keys if global_keys is not None else []
+            params = {param_name: global_keys}

-        query_str = """query CheckAllDataRowsHaveBeenProcessedPyApi($%s: [ID!]!) {
-            queryAllDataRowsHaveBeenProcessed(dataRowIds:$%s) {
+        query_str = """query CheckAllDataRowsHaveBeenProcessedPyApi($%s: [ID!]) {
+            queryAllDataRowsHaveBeenProcessed(%s:$%s) {
                 allDataRowsHaveBeenProcessed
             }
-        }""" % (data_row_ids_param, data_row_ids_param)
+        }""" % (param_name, param_name, param_name)

-        params = {}
-        params[data_row_ids_param] = data_row_ids
         response = self.client.execute(query_str, params)
         return response["queryAllDataRowsHaveBeenProcessed"][
             "allDataRowsHaveBeenProcessed"]
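For clarity, when only global keys are supplied, the `%` substitution above renders the query roughly as follows:

query CheckAllDataRowsHaveBeenProcessedPyApi($globalKeys: [ID!]) {
    queryAllDataRowsHaveBeenProcessed(globalKeys:$globalKeys) {
        allDataRowsHaveBeenProcessed
    }
}

with `params = {"globalKeys": [...]}` supplying the variable; the `dataRowIds` spelling is used instead whenever a non-empty `data_row_ids` list is passed.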