Commit 1823ad6

AL-4081: Extended create_batch method with DRPS logic
1 parent 70484dd commit 1823ad6

7 files changed: +145 −20 lines changed

labelbox/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -27,4 +27,4 @@
 from labelbox.schema.resource_tag import ResourceTag
 from labelbox.schema.project_resource_tag import ProjectResourceTag
 from labelbox.schema.media_type import MediaType
-from labelbox.schema.slice import Slice, CatalogSlice
+from labelbox.schema.slice import Slice, CatalogSlice

labelbox/client.py

Lines changed: 2 additions & 2 deletions
@@ -751,7 +751,7 @@ def get_data_row_ids_for_external_ids(
         for row in self.execute(
                 query_str,
                 {'externalId_in': external_ids[i:i + max_ids_per_request]
-                })['externalIdsToDataRowIds']:
+                })['externalIdsToDataRowIds']:
             result[row['externalId']].append(row['dataRowId'])
         return result

@@ -1058,7 +1058,7 @@ def _format_failed_rows(rows: Dict[str, str],
         result_params = {
             "jobId":
                 assign_global_keys_to_data_rows_job["assignGlobalKeysToDataRows"
-                ]["jobId"]
+                ]["jobId"]
         }

         # Poll job status until finished, then retrieve results

labelbox/schema/batch.py

Lines changed: 10 additions & 5 deletions
@@ -36,9 +36,10 @@ class Batch(DbObject):
     # Relationships
     created_by = Relationship.ToOne("User")

-    def __init__(self, client, project_id, *args, **kwargs):
+    def __init__(self, client, project_id, *args, failed_data_row_ids=None, **kwargs):
         super().__init__(client, *args, **kwargs)
         self.project_id = project_id
+        self._failed_data_row_ids = failed_data_row_ids

     def project(self) -> 'Project':  # type: ignore
         """ Returns Project which this Batch belongs to

@@ -75,7 +76,7 @@ def remove_queued_data_rows(self) -> None:
                 batch_id_param), {
                     project_id_param: self.project_id,
                     batch_id_param: self.uid
-                },
+                },
             experimental=True)

     def export_data_rows(self,

@@ -144,8 +145,8 @@ def delete(self) -> None:
                 batch_id_param), {
                     project_id_param: self.project_id,
                     batch_id_param: self.uid
-                },
-            experimental=True)
+                },
+            experimental=True)

     def delete_labels(self, set_labels_as_template=False) -> None:
         """ Deletes labels that were created for data rows in the batch.

@@ -170,6 +171,10 @@ def delete_labels(self, set_labels_as_template=False) -> None:
                 type_param:
                     "RequeueDataWithLabelAsTemplate"
                     if set_labels_as_template else "RequeueData"
-                },
+                },
             experimental=True)
         return res
+
+    @property
+    def failed_data_row_ids(self):
+        return self._failed_data_row_ids
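
A `Batch` is normally constructed by the SDK from a GraphQL response, so the new keyword argument is mostly internal plumbing. A minimal sketch of the intended pattern, with hypothetical client, IDs, and field values:

    # Hypothetical sketch: the SDK builds a Batch from a response dict and
    # threads the server-reported failures through the new keyword argument.
    batch = Batch(client, "cl0project123",
                  {"id": "cl0batch456", "name": "my batch", "size": 5},
                  failed_data_row_ids=["cl0datarow789"])

    # The property added above exposes the failures read-only.
    print(batch.failed_data_row_ids)  # ['cl0datarow789']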

labelbox/schema/project.py

Lines changed: 44 additions & 10 deletions
@@ -4,16 +4,15 @@
 from collections import namedtuple
 from datetime import datetime, timezone
 from pathlib import Path
-from typing import TYPE_CHECKING, Dict, Union, Iterable, List, Optional, Any
+from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Union
 from urllib.parse import urlparse

 import ndjson
 import requests
-
 from labelbox import utils
 from labelbox.exceptions import InvalidQueryError, LabelboxError
 from labelbox.orm import query
-from labelbox.orm.db_object import DbObject, Updateable, Deletable
+from labelbox.orm.db_object import DbObject, Deletable, Updateable
 from labelbox.orm.model import Entity, Field, Relationship
 from labelbox.pagination import PaginatedCollection
 from labelbox.schema.media_type import MediaType

@@ -318,7 +317,7 @@ def _validate_datetime(string_date: str) -> bool:
                 return True
             except ValueError:
                 pass
-        raise ValueError(f"""Incorrect format for: {string_date}.
+        raise ValueError(f"""Incorrect format for: {string_date}.
         Format must be \"YYYY-MM-DD\" or \"YYYY-MM-DD hh:mm:ss\"""")
     return True

@@ -561,7 +560,7 @@ def setup(self, labeling_frontend, labeling_frontend_options) -> None:
         timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
         self.update(setup_complete=timestamp)

-    def create_batch(self, name: str, data_rows: List[str], priority: int = 5):
+    def create_batch(self, name: str, data_rows: List[str], priority: int = 5, wait_processing_max_seconds: int = 5):
         """Create a new batch for a project. Batches is in Beta and subject to change

         Args:

@@ -590,11 +589,18 @@ def create_batch(self, name: str, data_rows: List[str], priority: int = 5):
         if not len(dr_ids):
             raise ValueError("You need at least one data row in a batch")

-        method = 'createBatch'
+        self._wait_until_data_rows_are_processed(
+            data_rows,
+            wait_processing_max_seconds=wait_processing_max_seconds
+        )
+        method = 'createBatchV2'
         query_str = """mutation %sPyApi($projectId: ID!, $batchInput: CreateBatchInput!) {
                   project(where: {id: $projectId}) {
                     %s(input: $batchInput) {
-                      %s
+                      batch {
+                        %s
+                      }
+                      failedDataRowIds
                     }
                   }
                 }

@@ -613,9 +619,9 @@ def create_batch(self, name: str, data_rows: List[str], priority: int = 5):
             params,
             timeout=180.0,
             experimental=True)["project"][method]
-
-        res['size'] = len(dr_ids)
-        return Entity.Batch(self.client, self.uid, res)
+        batch = res['batch']
+        batch['size'] = len(dr_ids)
+        return Entity.Batch(self.client, self.uid, batch, failed_data_row_ids=res['failedDataRowIds'])

     def _update_queue_mode(self, mode: "QueueMode") -> "QueueMode":
         """

@@ -964,6 +970,34 @@ def _is_url_valid(url: Union[str, Path]) -> bool:
         raise ValueError(
             f'Invalid annotations given of type: {type(annotations)}')

+    def _wait_until_data_rows_are_processed(self, data_row_ids: List[str], wait_processing_max_seconds: int, sleep_interval=30):
+        """ Wait until all the specified data rows are processed"""
+        start_time = datetime.now()
+        while True:
+            if (datetime.now() - start_time).total_seconds() >= wait_processing_max_seconds:
+                logger.warning(
+                    """Not all data rows have been processed, proceeding anyway""")
+                return
+
+            all_good = self.__check_data_rows_have_been_processed(data_row_ids)
+            if all_good:
+                return
+            time.sleep(sleep_interval)
+
+    def __check_data_rows_have_been_processed(self, data_row_ids: List[str]):
+        data_row_ids_param = "data_row_ids"
+
+        query_str = """query CheckAllDataRowsHaveBeenProcessedPyApi($%s: [ID!]!) {
+            queryAllDataRowsHaveBeenProcessed(dataRowIds: $%s) {
+                allDataRowsHaveBeenProcessed
+            }
+        }""" % (data_row_ids_param, data_row_ids_param)
+
+        params = {}
+        params[data_row_ids_param] = data_row_ids
+        response = self.client.execute(query_str, params)
+        return response["queryAllDataRowsHaveBeenProcessed"]["allDataRowsHaveBeenProcessed"]


 class ProjectMember(DbObject):
     user = Relationship.ToOne("User", cache=True)
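
Taken together, `create_batch` now waits (up to `wait_processing_max_seconds`) for data row processing before issuing the `createBatchV2` mutation, and rows the server rejects are reported on the batch instead of raising. A short usage sketch; the API key, project ID, and data row IDs below are hypothetical:

    from labelbox import Client

    client = Client(api_key="<YOUR_API_KEY>")
    project = client.get_project("cl0project123")  # hypothetical project ID

    # Give processing up to 60 seconds before creating the batch; on timeout
    # a warning is logged and batch creation proceeds anyway.
    batch = project.create_batch("my-batch",
                                 data_rows=["cl0datarow1", "cl0datarow2"],
                                 priority=3,
                                 wait_processing_max_seconds=60)

    # Rows the server could not add are reported rather than raised.
    print(batch.failed_data_row_ids)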

pytest.ini

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
 [pytest]
-addopts = -s -vv -x --reruns 5 --reruns-delay 10 --durations=20
+addopts = -s -vv -x
 markers =
     slow: marks tests as slow (deselect with '-m "not slow"')

tests/integration/conftest.py

Lines changed: 6 additions & 0 deletions
@@ -188,6 +188,12 @@ def dataset(client, rand_gen):
     yield dataset
     dataset.delete()

+@pytest.fixture(scope='function')
+def unique_dataset(client, rand_gen):
+    dataset = client.create_dataset(name=rand_gen(str))
+    yield dataset
+    dataset.delete()
+

 @pytest.fixture
 def datarow(dataset, image_url):

tests/integration/test_batch.py

Lines changed: 81 additions & 1 deletion
@@ -1,5 +1,6 @@
-import pytest
+import warnings

+import pytest
 from labelbox import Dataset, Project
 from labelbox.schema.queue_mode import QueueMode

@@ -32,6 +33,23 @@ def small_dataset(dataset: Dataset):
     yield dataset


+@pytest.fixture(scope='function')
+def dataset_with_invalid_data_rows(unique_dataset: Dataset):
+    upload_invalid_data_rows_for_dataset(unique_dataset)
+
+    yield unique_dataset
+
+
+def upload_invalid_data_rows_for_dataset(dataset: Dataset):
+    task = dataset.create_data_rows([
+        {
+            "row_data": 'https://jakub-da-test-primary.s3.us-east-2.amazonaws.com/dogecoin-whitepaper.pdf',
+            "external_id": "my-pdf"
+        },
+    ] * 2)
+    task.wait_till_done()
+
+
 def test_create_batch(batch_project: Project, big_dataset: Dataset):
     data_rows = [dr.uid for dr in list(big_dataset.export_data_rows())]
     batch = batch_project.create_batch("test-batch", data_rows, 3)

@@ -60,12 +78,74 @@ def test_batch_project(batch_project: Project, small_dataset: Dataset):
     data_rows = [dr.uid for dr in list(small_dataset.export_data_rows())]
     batch = batch_project.create_batch("batch to test project relationship",
                                        data_rows)
+
     project_from_batch = batch.project()

     assert project_from_batch.uid == batch_project.uid
     assert project_from_batch.name == batch_project.name


+def test_batch_creation_for_data_rows_with_issues(
+        batch_project: Project,
+        small_dataset: Dataset,
+        dataset_with_invalid_data_rows: Dataset
+):
+    """
+    Create a batch containing both valid and invalid data rows
+    """
+    valid_data_rows = [dr.uid for dr in list(small_dataset.export_data_rows())]
+    invalid_data_rows = [dr.uid for dr in list(
+        dataset_with_invalid_data_rows.export_data_rows())]
+    data_rows_to_add = valid_data_rows + invalid_data_rows
+
+    assert len(data_rows_to_add) == 5
+    batch = batch_project.create_batch(
+        "batch to test failed data rows",
+        data_rows_to_add
+    )
+
+    assert len(batch.failed_data_row_ids) == 2
+
+    failed_data_row_ids_set = set(batch.failed_data_row_ids)
+    invalid_data_rows_set = set(invalid_data_rows)
+    assert len(failed_data_row_ids_set.intersection(
+        invalid_data_rows_set)) == 2
+
+
+def test_batch_creation_with_processing_timeout(
+        batch_project: Project,
+        small_dataset: Dataset,
+        unique_dataset: Dataset
+):
+    """
+    Create a batch with zero wait time; this means the waiting will
+    terminate instantly
+    """
+    # wait for these data rows to be processed
+    valid_data_rows = [dr.uid for dr in list(small_dataset.export_data_rows())]
+    batch_project._wait_until_data_rows_are_processed(
+        valid_data_rows, wait_processing_max_seconds=3600, sleep_interval=5
+    )
+
+    # upload data rows for this dataset and don't wait for processing
+    upload_invalid_data_rows_for_dataset(unique_dataset)
+    unprocessed_data_rows = [dr.uid for dr in list(
+        unique_dataset.export_data_rows())]
+
+    data_row_ids = valid_data_rows + unprocessed_data_rows
+    with warnings.catch_warnings(record=True) as w:
+        warnings.simplefilter("always")
+        batch_project.create_batch(
+            "batch to test failed data rows",
+            data_row_ids,
+            wait_processing_max_seconds=0
+        )
+    assert len(w) == 1
+    assert issubclass(w[-1].category, DeprecationWarning)
+    assert "Not all data rows have been processed, proceeding anyway" in str(
+        w[-1].message)


 def test_export_data_rows(batch_project: Project, dataset: Dataset):
     n_data_rows = 5
     task = dataset.create_data_rows([
