Merge remote-tracking branch 'origin/develop' into farkob/schema_id_name

farkob · farkob · commit 8a44aaa1ed07 · 2022-07-11T21:25:22.000+03:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,19 @@
 # Changelog
 
+# Version 3.24.1 (2022-07-07)
+## Updated
+* Create/update/delete metadata schema functions now call `refresh_ontology()` after the change is applied
+
+# Version 3.24.0 (2022-07-06)
+## Added
+* `DataRowMetadataOntology` class now has functions to create/update/delete metadata schema
+    * `create_schema` - Create custom metadata schema
+    * `update_schema` - Update name of custom metadata schema
+    * `update_enum_options` - Update name of an Enum option for an Enum custom metadata schema
+    * `delete_schema` - Delete custom metadata schema
+* `ModelRun` class now has `assign_data_rows_to_split` function, which can assign a `DataSplit` to a list of `DataRow`s
+* `Dataset.create_data_rows()` can bulk import `conversationalData`
+
 # Version 3.23.3 (2022-06-23)
 
 ## Fix
diff --git a/README.md b/README.md
@@ -85,7 +85,7 @@ client = Client( endpoint = "<local deployment>")
 client = Client(api_key=os.environ['LABELBOX_TEST_API_KEY_LOCAL'], endpoint="http://localhost:8080/graphql")
 
 # Staging
-client = Client(api_key=os.environ['LABELBOX_TEST_API_KEY_LOCAL'], endpoint="https://staging-api.labelbox.com/graphql")
+client = Client(api_key=os.environ['LABELBOX_TEST_API_KEY_LOCAL'], endpoint="https://api.lb-stage.xyz/graphql")
 ```
 
 ## Contribution
@@ -122,5 +122,5 @@ make test-prod # with an optional flag: PATH_TO_TEST=tests/integration/...etc LA
 make -B {build|test-staging|test-prod}
 ```
 
-6. Testing against Delegated Access will be skipped unless the local env contains the key: 
-DA_GCP_LABELBOX_API_KEY. These tests will be included when run against a PR. If you would like to test it manually, please reach out to the Devops team for information on the key.
+6. Tests against Delegated Access are skipped unless the local environment contains the key
+`DA_GCP_LABELBOX_API_KEY`. These tests are included when run against a PR. If you would like to run them manually, please reach out to the DevOps team for information on the key.
diff --git a/labelbox/__init__.py b/labelbox/__init__.py
@@ -1,5 +1,5 @@
 name = "labelbox"
-__version__ = "3.23.3"
+__version__ = "3.24.1"
 
 from labelbox.client import Client
 from labelbox.schema.project import Project
@@ -21,7 +21,7 @@
 from labelbox.schema.role import Role, ProjectRole
 from labelbox.schema.invite import Invite, InviteLimit
 from labelbox.schema.data_row_metadata import DataRowMetadataOntology
-from labelbox.schema.model_run import ModelRun
+from labelbox.schema.model_run import ModelRun, DataSplit
 from labelbox.schema.benchmark import Benchmark
 from labelbox.schema.iam_integration import IAMIntegration
 from labelbox.schema.resource_tag import ResourceTag
diff --git a/labelbox/schema/data_row_metadata.py b/labelbox/schema/data_row_metadata.py
@@ -366,6 +366,7 @@ def delete_schema(self, name: str) -> bool:
         res = self._client.execute(query, {'where': {
             'id': schema.uid
         }})['deleteCustomMetadataSchema']
+        self.refresh_ontology()
 
         return res['success']
 
@@ -642,6 +643,7 @@ def _upsert_schema(
         res = self._client.execute(
             query, {"data": upsert_schema.dict(exclude_none=True)
                    })['upsertCustomMetadataSchema']
+        self.refresh_ontology()
         return _parse_metadata_schema(res)
 
     def _parse_upsert(
diff --git a/labelbox/schema/dataset.py b/labelbox/schema/dataset.py
@@ -226,6 +226,7 @@ def _create_descriptor_file(self, items, max_attachments_per_data_row=None):
         >>>     {DataRow.row_data:"/path/to/file1.jpg"},
         >>>     "path/to/file2.jpg",
         >>>     {"tileLayerUrl" : "http://", ...}
+        >>>     {"conversationalData" : [...], ...}
         >>>     ])
 
         For an example showing how to upload tiled data_rows see the following notebook:
@@ -280,6 +281,33 @@ def validate_attachments(item):
                     )
             return attachments
 
+        def validate_conversational_data(conversational_data: list) -> None:
+            """
+            Checks each conversational message for keys expected as per https://docs.labelbox.com/reference/text-conversational#sample-conversational-json
+
+            Args:
+                conversational_data (list): list of dictionaries.
+            Raises:
+                ValueError: if `conversational_data` is not a list.
+                KeyError: if a message contains a key outside the accepted set.
+            """
+            accepted_message_keys = {
+                "messageId", "timestampUsec", "content", "user", "align",
+                "canLabel"
+            }
+
+            # Check the type unconditionally: guarding on truthiness first
+            # would let `None` fall through to the loop below (failing with
+            # an unrelated TypeError) and let empty non-list values pass.
+            if not isinstance(conversational_data, list):
+                raise ValueError(
+                    f"conversationalData must be a list. Found {type(conversational_data)}"
+                )
+
+            for message in conversational_data:
+                for key in message.keys():
+                    if key not in accepted_message_keys:
+                        raise KeyError(
+                            f"Invalid {key} key found! Accepted keys in messages list is {accepted_message_keys}"
+                        )
+
         def parse_metadata_fields(item):
             metadata_fields = item.get('metadata_fields')
             if metadata_fields:
@@ -321,6 +349,27 @@ def convert_item(item):
             if "tileLayerUrl" in item:
                 validate_attachments(item)
                 return item
+
+            if "conversationalData" in item:
+                messages = item.pop("conversationalData")
+                version = item.pop("version")
+                type = item.pop("type")
+                if "externalId" in item:
+                    external_id = item.pop("externalId")
+                    item["external_id"] = external_id
+                validate_conversational_data(messages)
+                one_conversation = \
+                    {
+                        "type": type,
+                        "version": version,
+                        "messages": messages
+                    }
+                conversationUrl = self.client.upload_data(
+                    json.dumps(one_conversation),
+                    content_type="application/json",
+                    filename="conversational_data.json")
+                item["row_data"] = conversationUrl
+
             # Convert all payload variations into the same dict format
             item = format_row(item)
             # Make sure required keys exist (and there are no extra keys)
diff --git a/labelbox/schema/model_run.py b/labelbox/schema/model_run.py
@@ -1,10 +1,12 @@
+# type: ignore
 from typing import TYPE_CHECKING, Dict, Iterable, Union, List, Optional, Any
 from pathlib import Path
 import os
 import time
 import logging
 import requests
 import ndjson
+from enum import Enum
 
 from labelbox.pagination import PaginatedCollection
 from labelbox.orm.query import results_query_part
@@ -17,13 +19,27 @@
 logger = logging.getLogger(__name__)
 
 
+class DataSplit(Enum):
+    """ Data split a data row can have within a model run.
+
+    Each member's value is identical to its name.
+    """
+    TRAINING = "TRAINING"
+    TEST = "TEST"
+    VALIDATION = "VALIDATION"
+    # Rejected by `ModelRun.assign_data_rows_to_split`.
+    UNASSIGNED = "UNASSIGNED"
+
+
 class ModelRun(DbObject):
     name = Field.String("name")
     updated_at = Field.DateTime("updated_at")
     created_at = Field.DateTime("created_at")
     created_by_id = Field.String("created_by_id", "createdBy")
     model_id = Field.String("model_id")
 
+    class Status(Enum):
+        EXPORTING_DATA = "EXPORTING_DATA"
+        PREPARING_DATA = "PREPARING_DATA"
+        TRAINING_MODEL = "TRAINING_MODEL"
+        COMPLETE = "COMPLETE"
+        FAILED = "FAILED"
+
     def upsert_labels(self, label_ids, timeout_seconds=60):
         """ Adds data rows and labels to a model run
         Args:
@@ -90,8 +106,9 @@ def upsert_data_rows(self, data_row_ids, timeout_seconds=60):
             }})['MEADataRowRegistrationTaskStatus'],
                                      timeout_seconds=timeout_seconds)
 
-    def _wait_until_done(self, status_fn, timeout_seconds=60, sleep_time=5):
+    def _wait_until_done(self, status_fn, timeout_seconds=120, sleep_time=5):
         # Do not use this function outside of the scope of upsert_data_rows or upsert_labels. It could change.
+        original_timeout = timeout_seconds
         while True:
             res = status_fn()
             if res['status'] == 'COMPLETE':
@@ -102,9 +119,8 @@ def _wait_until_done(self, status_fn, timeout_seconds=60, sleep_time=5):
             timeout_seconds -= sleep_time
             if timeout_seconds <= 0:
                 raise TimeoutError(
-                    f"Unable to complete import within {timeout_seconds} seconds."
+                    f"Unable to complete import within {original_timeout} seconds."
                 )
-
             time.sleep(sleep_time)
 
     def add_predictions(
@@ -161,7 +177,7 @@ def delete(self):
             deleteModelRuns(where: {ids: [$%s]})}""" % (ids_param, ids_param)
         self.client.execute(query_str, {ids_param: str(self.uid)})
 
-    def delete_model_run_data_rows(self, data_row_ids):
+    def delete_model_run_data_rows(self, data_row_ids: List[str]):
         """ Deletes data rows from model runs.
 
         Args:
@@ -180,22 +196,62 @@ def delete_model_run_data_rows(self, data_row_ids):
             data_row_ids_param: data_row_ids
         })
 
+    @experimental
+    def assign_data_rows_to_split(self,
+                                  data_row_ids: List[str],
+                                  split: Union[DataSplit, str],
+                                  timeout_seconds=120):
+        """ Assigns data rows of this model run to a data split.
+
+        Starts an assignment task and polls its status until it is done.
+
+        Args:
+            data_row_ids (list): ids of the data rows to assign.
+            split (DataSplit or str): split to assign, as an enum member or
+                its string value. `UNASSIGNED` cannot be assigned.
+            timeout_seconds (float): max waiting time for the task, in seconds.
+        Returns:
+            The result of `_wait_until_done` for the assignment task.
+        Raises:
+            ValueError: if `split` is `UNASSIGNED` or not a valid split.
+            TimeoutError: if the task does not finish within `timeout_seconds`.
+        """
+
+        split_value = split.value if isinstance(split, DataSplit) else split
+
+        if split_value == DataSplit.UNASSIGNED.value:
+            raise ValueError(
+                f"Cannot assign split value of `{DataSplit.UNASSIGNED.value}`.")
+
+        # Build a real list: a lazy `filter` object would be exhausted by the
+        # membership test and render as `<filter object ...>` in the message.
+        valid_splits = [
+            name for name in DataSplit._member_names_
+            if name != DataSplit.UNASSIGNED.value
+        ]
+
+        if split_value not in valid_splits:
+            raise ValueError(
+                f"`split` must be one of : `{valid_splits}`. Found : `{split}`")
+
+        task_id = self.client.execute(
+            """mutation assignDataSplitPyApi($modelRunId: ID!, $data: CreateAssignDataRowsToDataSplitTaskInput!){
+                  createAssignDataRowsToDataSplitTask(modelRun : {id: $modelRunId}, data: $data)}
+            """, {
+                'modelRunId': self.uid,
+                'data': {
+                    'assignments': [{
+                        'split': split_value,
+                        'dataRowIds': data_row_ids
+                    }]
+                }
+            },
+            experimental=True)['createAssignDataRowsToDataSplitTask']
+
+        status_query_str = """query assignDataRowsToDataSplitTaskStatusPyApi($id: ID!){
+            assignDataRowsToDataSplitTaskStatus(where: {id : $id}){status errorMessage}}
+            """
+
+        return self._wait_until_done(lambda: self.client.execute(
+            status_query_str, {'id': task_id}, experimental=True)[
+                'assignDataRowsToDataSplitTaskStatus'],
+                                     timeout_seconds=timeout_seconds)
+
     @experimental
     def update_status(self,
-                      status: str,
+                      status: Union[str, "ModelRun.Status"],
                       metadata: Optional[Dict[str, str]] = None,
                       error_message: Optional[str] = None):
 
-        valid_statuses = [
-            "EXPORTING_DATA", "PREPARING_DATA", "TRAINING_MODEL", "COMPLETE",
-            "FAILED"
-        ]
-        if status not in valid_statuses:
+        status_value = status.value if isinstance(status,
+                                                  ModelRun.Status) else status
+        if status_value not in ModelRun.Status._member_names_:
             raise ValueError(
-                f"Status must be one of : `{valid_statuses}`. Found : `{status}`"
+                f"Status must be one of : `{ModelRun.Status._member_names_}`. Found : `{status_value}`"
             )
 
-        data: Dict[str, Any] = {'status': status}
+        data: Dict[str, Any] = {'status': status_value}
         if error_message:
             data['errorMessage'] = error_message
 
@@ -264,6 +320,7 @@ def export_labels(
 class ModelRunDataRow(DbObject):
     label_id = Field.String("label_id")
     model_run_id = Field.String("model_run_id")
+    data_split = Field.Enum(DataSplit, "data_split")
     data_row = Relationship.ToOne("DataRow", False, cache=True)
 
     def __init__(self, client, model_id, *args, **kwargs):
diff --git a/tests/integration/annotation_import/test_model_run.py b/tests/integration/annotation_import/test_model_run.py
@@ -2,6 +2,9 @@
 import os
 import pytest
 
+from collections import Counter
+from labelbox import DataSplit, ModelRun
+
 
 def test_model_run(client, configured_project_with_label, rand_gen):
     project, _, _, label = configured_project_with_label
@@ -119,3 +122,40 @@ def get_model_run_status():
     assert model_run_status['status'] == status
     assert model_run_status['metadata'] == {**metadata, **extra_metadata}
     assert model_run_status['errorMessage'] == errorMessage
+
+    status = ModelRun.Status.FAILED
+    model_run_with_model_run_data_rows.update_status(status, metadata,
+                                                     errorMessage)
+    model_run_status = get_model_run_status()
+    assert model_run_status['status'] == status.value
+
+    with pytest.raises(ValueError):
+        model_run_with_model_run_data_rows.update_status(
+            "INVALID", metadata, errorMessage)
+
+
+def test_model_run_split_assignment(model_run, dataset, image_url):
+    """Assigning a valid split moves every data row of the model run to it."""
+    n_data_rows = 10
+    payload = [{"row_data": image_url} for _ in range(n_data_rows)]
+    upload_task = dataset.create_data_rows(payload)
+    data_row_ids = [row['id'] for row in upload_task.result]
+
+    model_run.upsert_data_rows(data_row_ids)
+
+    # Unknown and UNASSIGNED splits must both be rejected.
+    with pytest.raises(ValueError):
+        model_run.assign_data_rows_to_split(data_row_ids, "INVALID SPLIT")
+
+    with pytest.raises(ValueError):
+        model_run.assign_data_rows_to_split(data_row_ids, DataSplit.UNASSIGNED)
+
+    # Exercise both accepted spellings: plain strings and enum members.
+    candidates = ["TRAINING", "TEST", "VALIDATION", *DataSplit]
+    for candidate in candidates:
+        if candidate == DataSplit.UNASSIGNED:
+            continue
+
+        model_run.assign_data_rows_to_split(data_row_ids, candidate)
+        counts = Counter(
+            row.data_split.value for row in model_run.model_run_data_rows())
+        if isinstance(candidate, DataSplit):
+            expected = candidate.value
+        else:
+            expected = candidate
+        assert counts[expected] == n_data_rows
diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py
@@ -1,3 +1,4 @@
+import json
 import os
 import re
 import time
@@ -145,7 +146,10 @@ def client(environ: str):
 
 @pytest.fixture(scope="session")
 def image_url(client):
-    return client.upload_data(requests.get(IMG_URL).content, sign=True)
+    return client.upload_data(requests.get(IMG_URL).content,
+                              content_type="application/json",
+                              filename="json_import.json",
+                              sign=True)
 
 
 @pytest.fixture
@@ -181,16 +185,23 @@ def iframe_url(environ) -> str:
     if environ in [Environ.PROD, Environ.LOCAL]:
         return 'https://editor.labelbox.com'
     elif environ == Environ.STAGING:
-        return 'https://staging.labelbox.dev/editor'
+        return 'https://editor.lb-stage.xyz'
 
 
 @pytest.fixture
 def sample_video() -> str:
     path_to_video = 'tests/integration/media/cat.mp4'
-    assert os.path.exists(path_to_video)
     return path_to_video
 
 
+@pytest.fixture
+def sample_bulk_conversation() -> list:
+    """Return the parsed content of the bulk conversation sample file."""
+    with open('tests/integration/media/bulk_conversation.json') as handle:
+        return json.load(handle)
+
+
 @pytest.fixture
 def organization(client):
     # Must have at least one seat open in your org to run these tests
@@ -290,7 +301,7 @@ def configured_project_with_label(client, rand_gen, image_url, project, dataset,
 
     def create_label():
         """ Ad-hoc function to create a LabelImport
-        
+
         Creates a LabelImport task which will create a label
         """
         upload_task = LabelImport.create_from_objects(
diff --git a/tests/integration/media/bulk_conversation.json b/tests/integration/media/bulk_conversation.json
diff --git a/tests/integration/test_client_errors.py b/tests/integration/test_client_errors.py
diff --git a/tests/integration/test_dataset.py b/tests/integration/test_dataset.py
diff --git a/tests/integration/test_filtering.py b/tests/integration/test_filtering.py