
Commit 0f38024

Merge pull request #755 from Labelbox/sdubinin/al-4081
[AL-4081] Wait for data rows to be processed when creating a batch

2 parents: 893ed1e + 3b26955

File tree: 6 files changed, +150 / -12 lines

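In effect, Project.create_batch now blocks until Labelbox reports every referenced data row as processed, raising the new ProcessingWaitTimeout once the wait exceeds one hour, and the returned Batch reports any rows the backend rejected. A minimal usage sketch; the API key, project ID, and data row IDs are placeholders:

    from labelbox import Client
    from labelbox.exceptions import ProcessingWaitTimeout

    client = Client(api_key="<YOUR_API_KEY>")      # placeholder credentials
    project = client.get_project("<PROJECT_ID>")   # placeholder project ID
    data_row_ids = ["<DATA_ROW_ID_1>", "<DATA_ROW_ID_2>"]

    try:
        # create_batch now waits (up to 3600 s by default) for the data rows
        # to finish processing before issuing the createBatchV2 mutation.
        batch = project.create_batch("my-batch", data_row_ids)
        # Rows the backend could not add are exposed on the new property.
        print("failed:", list(batch.failed_data_row_ids))
    except ProcessingWaitTimeout:
        # Raised when processing takes longer than _wait_processing_max_seconds.
        print("Data rows are still being processed; try again later.")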

labelbox/__init__.py

Lines changed: 1 addition & 1 deletion

@@ -27,4 +27,4 @@
 from labelbox.schema.resource_tag import ResourceTag
 from labelbox.schema.project_resource_tag import ProjectResourceTag
 from labelbox.schema.media_type import MediaType
-from labelbox.schema.slice import Slice, CatalogSlice
+from labelbox.schema.slice import Slice, CatalogSlice

labelbox/exceptions.py

Lines changed: 5 additions & 0 deletions

@@ -129,3 +129,8 @@ class MALValidationError(LabelboxError):
 class OperationNotAllowedException(Exception):
     """Raised when user does not have permissions to a resource or has exceeded usage limit"""
     pass
+
+
+class ProcessingWaitTimeout(Exception):
+    """Raised when waiting for the data rows to be processed takes longer than allowed"""
+    pass
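Note that ProcessingWaitTimeout subclasses Exception rather than LabelboxError, so a broad except LabelboxError handler will not catch it. A small sketch of the distinction; the wrapper function is hypothetical:

    from labelbox.exceptions import LabelboxError, ProcessingWaitTimeout

    def create_batch_or_hint(project, name, data_row_ids):
        # Hypothetical helper: the two except clauses are disjoint because
        # ProcessingWaitTimeout is not part of the LabelboxError hierarchy.
        try:
            return project.create_batch(name, data_row_ids)
        except ProcessingWaitTimeout:
            raise  # data rows still processing; retry later
        except LabelboxError:
            raise  # API-level failure; never a ProcessingWaitTimeout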

labelbox/schema/batch.py

Lines changed: 11 additions & 1 deletion

@@ -37,9 +37,15 @@ class Batch(DbObject):
     # Relationships
     created_by = Relationship.ToOne("User")

-    def __init__(self, client, project_id, *args, **kwargs):
+    def __init__(self,
+                 client,
+                 project_id,
+                 *args,
+                 failed_data_row_ids=None,
+                 **kwargs):
         super().__init__(client, *args, **kwargs)
         self.project_id = project_id
+        self._failed_data_row_ids = failed_data_row_ids

     def project(self) -> 'Project':  # type: ignore
         """ Returns Project which this Batch belongs to

@@ -174,3 +180,7 @@ def delete_labels(self, set_labels_as_template=False) -> None:
             },
             experimental=True)
         return res
+
+    @property
+    def failed_data_row_ids(self):
+        return (x for x in self._failed_data_row_ids)
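Each access to failed_data_row_ids builds a fresh generator over the stored IDs, and iterating it assumes _failed_data_row_ids was populated (it defaults to None for Batch objects not constructed through create_batch). A short sketch, reusing the batch from the first example:

    # `batch` as returned by Project.create_batch in the sketch above.
    failed = list(batch.failed_data_row_ids)  # materialize the generator once
    if failed:
        print(f"{len(failed)} data rows could not be added:", failed)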

labelbox/schema/project.py

Lines changed: 57 additions & 9 deletions

@@ -4,16 +4,17 @@
 from collections import namedtuple
 from datetime import datetime, timezone
 from pathlib import Path
-from typing import TYPE_CHECKING, Dict, Union, Iterable, List, Optional, Any
+from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Union
 from urllib.parse import urlparse

 import ndjson
 import requests

 from labelbox import utils
-from labelbox.exceptions import InvalidQueryError, LabelboxError
+from labelbox.exceptions import (InvalidQueryError, LabelboxError,
+                                 ProcessingWaitTimeout)
 from labelbox.orm import query
-from labelbox.orm.db_object import DbObject, Updateable, Deletable
+from labelbox.orm.db_object import DbObject, Deletable, Updateable
 from labelbox.orm.model import Entity, Field, Relationship
 from labelbox.pagination import PaginatedCollection
 from labelbox.schema.consensus_settings import ConsensusSettings

@@ -90,6 +91,9 @@ class Project(DbObject, Updateable, Deletable):
     benchmarks = Relationship.ToMany("Benchmark", False)
     ontology = Relationship.ToOne("Ontology", True)

+    # Maximum time (in seconds) to wait for data rows to finish processing
+    _wait_processing_max_seconds = 3600
+
     def update(self, **kwargs):
         """ Updates this project with the specified attributes

@@ -319,7 +323,7 @@ def _validate_datetime(string_date: str) -> bool:
                 return True
             except ValueError:
                 pass
-        raise ValueError(f"""Incorrect format for: {string_date}.
+        raise ValueError(f"""Incorrect format for: {string_date}.
         Format must be \"YYYY-MM-DD\" or \"YYYY-MM-DD hh:mm:ss\"""")
     return True

@@ -595,11 +599,16 @@ def create_batch(self,
         if not len(dr_ids):
             raise ValueError("You need at least one data row in a batch")

-        method = 'createBatch'
+        self._wait_until_data_rows_are_processed(
+            data_rows, self._wait_processing_max_seconds)
+        method = 'createBatchV2'
         query_str = """mutation %sPyApi($projectId: ID!, $batchInput: CreateBatchInput!) {
                   project(where: {id: $projectId}) {
                     %s(input: $batchInput) {
-                      %s
+                      batch {
+                        %s
+                      }
+                      failedDataRowIds
                     }
                   }
                 }

@@ -622,9 +631,12 @@ def create_batch(self,
                                       params,
                                       timeout=180.0,
                                       experimental=True)["project"][method]
-
-        res['size'] = len(dr_ids)
-        return Entity.Batch(self.client, self.uid, res)
+        batch = res['batch']
+        batch['size'] = len(dr_ids)
+        return Entity.Batch(self.client,
+                            self.uid,
+                            batch,
+                            failed_data_row_ids=res['failedDataRowIds'])

     def _update_queue_mode(self, mode: "QueueMode") -> "QueueMode":
         """

@@ -977,6 +989,42 @@ def _is_url_valid(url: Union[str, Path]) -> bool:
         raise ValueError(
             f'Invalid annotations given of type: {type(annotations)}')

+    def _wait_until_data_rows_are_processed(self,
+                                            data_row_ids: List[str],
+                                            wait_processing_max_seconds: int,
+                                            sleep_interval=30):
+        """ Wait until all the specified data rows are processed"""
+        start_time = datetime.now()
+        while True:
+            if (datetime.now() -
+                    start_time).total_seconds() >= wait_processing_max_seconds:
+                raise ProcessingWaitTimeout(
+                    "Maximum wait time exceeded while waiting for data rows to be processed. Try creating a batch a bit later"
+                )
+
+            all_good = self.__check_data_rows_have_been_processed(data_row_ids)
+            if all_good:
+                return
+
+            logger.debug(
+                'Some of the data rows are still being processed, waiting...')
+            time.sleep(sleep_interval)
+
+    def __check_data_rows_have_been_processed(self, data_row_ids: List[str]):
+        data_row_ids_param = "data_row_ids"
+
+        query_str = """query CheckAllDataRowsHaveBeenProcessedPyApi($%s: [ID!]!) {
+            queryAllDataRowsHaveBeenProcessed(dataRowIds:$%s) {
+                allDataRowsHaveBeenProcessed
+            }
+        }""" % (data_row_ids_param, data_row_ids_param)
+
+        params = {}
+        params[data_row_ids_param] = data_row_ids
+        response = self.client.execute(query_str, params)
+        return response["queryAllDataRowsHaveBeenProcessed"][
+            "allDataRowsHaveBeenProcessed"]


 class ProjectMember(DbObject):
     user = Relationship.ToOne("User", cache=True)

tests/integration/conftest.py

Lines changed: 7 additions & 0 deletions

@@ -191,6 +191,13 @@ def dataset(client, rand_gen):
     dataset.delete()


+@pytest.fixture(scope='function')
+def unique_dataset(client, rand_gen):
+    dataset = client.create_dataset(name=rand_gen(str))
+    yield dataset
+    dataset.delete()
+
+
 @pytest.fixture
 def datarow(dataset, image_url):
     task = dataset.create_data_rows([

tests/integration/test_batch.py

Lines changed: 69 additions & 1 deletion

@@ -1,5 +1,5 @@
+from labelbox.exceptions import ProcessingWaitTimeout
 import pytest
-
 from labelbox import Dataset, Project

 IMAGE_URL = "https://storage.googleapis.com/diagnostics-demo-data/coco/COCO_train2014_000000000034.jpg"

@@ -31,6 +31,23 @@ def small_dataset(dataset: Dataset):
     yield dataset


+@pytest.fixture(scope='function')
+def dataset_with_invalid_data_rows(unique_dataset: Dataset):
+    upload_invalid_data_rows_for_dataset(unique_dataset)
+
+    yield unique_dataset
+
+
+def upload_invalid_data_rows_for_dataset(dataset: Dataset):
+    task = dataset.create_data_rows([
+        {
+            "row_data": 'gs://lb-test-private/mask-2.png',  # forbidden
+            "external_id": "image-without-access.jpg"
+        },
+    ] * 2)
+    task.wait_till_done()
+
+
 def test_create_batch(batch_project: Project, big_dataset: Dataset):
     data_rows = [dr.uid for dr in list(big_dataset.export_data_rows())]
     batch = batch_project.create_batch("test-batch", data_rows, 3)

@@ -72,12 +89,63 @@ def test_batch_project(batch_project: Project, small_dataset: Dataset):
     data_rows = [dr.uid for dr in list(small_dataset.export_data_rows())]
     batch = batch_project.create_batch("batch to test project relationship",
                                        data_rows)
+
     project_from_batch = batch.project()

     assert project_from_batch.uid == batch_project.uid
     assert project_from_batch.name == batch_project.name


+def test_batch_creation_for_data_rows_with_issues(
+        batch_project: Project, small_dataset: Dataset,
+        dataset_with_invalid_data_rows: Dataset):
+    """
+    Create a batch containing both valid and invalid data rows
+    """
+    valid_data_rows = [dr.uid for dr in list(small_dataset.data_rows())]
+    invalid_data_rows = [
+        dr.uid for dr in list(dataset_with_invalid_data_rows.data_rows())
+    ]
+    data_rows_to_add = valid_data_rows + invalid_data_rows
+
+    assert len(data_rows_to_add) == 5
+    batch = batch_project.create_batch("batch to test failed data rows",
+                                       data_rows_to_add)
+    failed_data_row_ids = [x for x in batch.failed_data_row_ids]
+    assert len(failed_data_row_ids) == 2
+
+    failed_data_row_ids_set = set(failed_data_row_ids)
+    invalid_data_rows_set = set(invalid_data_rows)
+    assert len(failed_data_row_ids_set.intersection(invalid_data_rows_set)) == 2
+
+
+def test_batch_creation_with_processing_timeout(batch_project: Project,
+                                                small_dataset: Dataset,
+                                                unique_dataset: Dataset):
+    """
+    Create a batch with zero wait time, so the waiting logic raises immediately
+    """
+    # wait for these data rows to be processed
+    valid_data_rows = [dr.uid for dr in list(small_dataset.data_rows())]
+    batch_project._wait_until_data_rows_are_processed(
+        valid_data_rows, wait_processing_max_seconds=3600, sleep_interval=5)
+
+    # upload data rows for this dataset and don't wait
+    upload_invalid_data_rows_for_dataset(unique_dataset)
+    unprocessed_data_rows = [dr.uid for dr in list(unique_dataset.data_rows())]
+
+    data_row_ids = valid_data_rows + unprocessed_data_rows
+
+    stashed_wait_timeout = batch_project._wait_processing_max_seconds
+    with pytest.raises(ProcessingWaitTimeout):
+        # emulate the situation where there are still some data rows being
+        # processed but the wait timeout is exceeded
+        batch_project._wait_processing_max_seconds = 0
+        batch_project.create_batch("batch to test failed data rows",
+                                   data_row_ids)
+    batch_project._wait_processing_max_seconds = stashed_wait_timeout


 def test_export_data_rows(batch_project: Project, dataset: Dataset):
     n_data_rows = 5
     task = dataset.create_data_rows([
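A side note on test_batch_creation_with_processing_timeout: the stashed timeout is restored by a plain assignment after the with block, so a failure inside the block would leak the zeroed value into later tests sharing the fixture. A sketch of the same scenario using pytest's monkeypatch, which undoes the change automatically; this is an alternative, not the committed test:

    def test_batch_creation_with_processing_timeout_alt(batch_project: Project,
                                                        small_dataset: Dataset,
                                                        unique_dataset: Dataset,
                                                        monkeypatch):
        # monkeypatch restores _wait_processing_max_seconds even if the test
        # fails before reaching a manual cleanup line.
        valid_data_rows = [dr.uid for dr in small_dataset.data_rows()]
        upload_invalid_data_rows_for_dataset(unique_dataset)
        unprocessed = [dr.uid for dr in unique_dataset.data_rows()]

        monkeypatch.setattr(batch_project, "_wait_processing_max_seconds", 0)
        with pytest.raises(ProcessingWaitTimeout):
            batch_project.create_batch("batch to test failed data rows",
                                       valid_data_rows + unprocessed)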
