assign data row split

Matt Sokoloff · Matt Sokoloff · commit edb9e1866e30 · 2022-06-29T09:48:13.000-04:00
diff --git a/README.md b/README.md
@@ -85,7 +85,7 @@ client = Client( endpoint = "<local deployment>")
 client = Client(api_key=os.environ['LABELBOX_TEST_API_KEY_LOCAL'], endpoint="http://localhost:8080/graphql")
 
 # Staging
-client = Client(api_key=os.environ['LABELBOX_TEST_API_KEY_LOCAL'], endpoint="https://staging-api.labelbox.com/graphql")
+client = Client(api_key=os.environ['LABELBOX_TEST_API_KEY_LOCAL'], endpoint="https://api.lb-stage.xyz/graphql")
 ```
 
 ## Contribution
@@ -122,5 +122,5 @@ make test-prod # with an optional flag: PATH_TO_TEST=tests/integration/...etc LA
 make -B {build|test-staging|test-prod}
 ```
 
-6. Testing against Delegated Access will be skipped unless the local env contains the key: 
-DA_GCP_LABELBOX_API_KEY. These tests will be included when run against a PR. If you would like to test it manually, please reach out to the Devops team for information on the key.
+6. Testing against Delegated Access will be skipped unless the local env contains the key:
+DA_GCP_LABELBOX_API_KEY. These tests will be included when run against a PR. If you would like to test it manually, please reach out to the Devops team for information on the key.
diff --git a/labelbox/schema/model_run.py b/labelbox/schema/model_run.py
@@ -92,6 +92,7 @@ def upsert_data_rows(self, data_row_ids, timeout_seconds=60):
 
     def _wait_until_done(self, status_fn, timeout_seconds=60, sleep_time=5):
         # Do not use this function outside of the scope of upsert_data_rows or upsert_labels. It could change.
+        original_timeout = timeout_seconds
         while True:
             res = status_fn()
             if res['status'] == 'COMPLETE':
@@ -102,7 +103,7 @@ def _wait_until_done(self, status_fn, timeout_seconds=60, sleep_time=5):
             timeout_seconds -= sleep_time
             if timeout_seconds <= 0:
                 raise TimeoutError(
-                    f"Unable to complete import within {timeout_seconds} seconds."
+                    f"Unable to complete import within {original_timeout} seconds."
                 )
 
             time.sleep(sleep_time)
@@ -180,6 +181,39 @@ def delete_model_run_data_rows(self, data_row_ids):
             data_row_ids_param: data_row_ids
         })
 
+    @experimental
+    def assign_data_rows_to_split(self,
+                                  data_row_ids,
+                                  split,
+                                  timeout_seconds=60):
+        valid_splits = ["TRAINING", "TEST", "VALIDATION"]
+        if split not in valid_splits:
+            raise ValueError(
+                f"split must be one of : `{valid_splits}`. Found : `{split}`")
+
+        task_id = self.client.execute(
+            """mutation assignDataSplitPyApi($modelRunId: ID!, $data: CreateAssignDataRowsToDataSplitTaskInput!){
+                  createAssignDataRowsToDataSplitTask(modelRun : {id: $modelRunId}, data: $data)}
+            """, {
+                'modelRunId': self.uid,
+                'data': {
+                    'assignments': [{
+                        'split': split,
+                        'dataRowIds': data_row_ids
+                    }]
+                }
+            },
+            experimental=True)['createAssignDataRowsToDataSplitTask']
+
+        status_query_str = """query assignDataRowsToDataSplitTaskStatusPyApi($id: ID!){
+            assignDataRowsToDataSplitTaskStatus(where: {id : $id}){status errorMessage}}
+            """
+
+        return self._wait_until_done(lambda: self.client.execute(
+            status_query_str, {'id': task_id}, experimental=True)[
+                'assignDataRowsToDataSplitTaskStatus'],
+                                     timeout_seconds=timeout_seconds)
+
     @experimental
     def update_status(self,
                       status: str,
@@ -264,6 +298,7 @@ def export_labels(
 class ModelRunDataRow(DbObject):
     label_id = Field.String("label_id")
     model_run_id = Field.String("model_run_id")
+    data_split = Field.String("data_split")
     data_row = Relationship.ToOne("DataRow", False, cache=True)
 
     def __init__(self, client, model_id, *args, **kwargs):
diff --git a/tests/integration/annotation_import/test_model_run.py b/tests/integration/annotation_import/test_model_run.py
@@ -2,6 +2,8 @@
 import os
 import pytest
 
+from collections import Counter
+
 
 def test_model_run(client, configured_project_with_label, rand_gen):
     project, _, _, label = configured_project_with_label
@@ -119,3 +121,24 @@ def get_model_run_status():
     assert model_run_status['status'] == status
     assert model_run_status['metadata'] == {**metadata, **extra_metadata}
     assert model_run_status['errorMessage'] == errorMessage
+
+
+def test_model_run_split_assignment(model_run, dataset, image_url):
+    n_data_rows = 10
+    data_rows = dataset.create_data_rows([{
+        "row_data": image_url
+    } for _ in range(n_data_rows)])
+    data_row_ids = [data_row['id'] for data_row in data_rows.result]
+
+    model_run.upsert_data_rows(data_row_ids)
+
+    for split in ["TRAINING", "TEST", "VALIDATION"]:
+        model_run.assign_data_rows_to_split(data_row_ids[:(n_data_rows // 2)],
+                                            split)
+        counts = Counter()
+        for data_row in model_run.model_run_data_rows():
+            counts[data_row.data_split] += 1
+        assert counts[split] == n_data_rows // 2
+
+    with pytest.raises(ValueError):
+        model_run.assign_data_rows_to_split(data_row_ids, "INVALID SPLIT")
diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py
@@ -47,7 +47,7 @@ def graphql_url(environ: str) -> str:
     if environ == Environ.PROD:
         return 'https://api.labelbox.com/graphql'
     elif environ == Environ.STAGING:
-        return 'https://staging-api.labelbox.com/graphql'
+        return 'https://api.lb-stage.xyz/graphql'
     elif environ == Environ.ONPREM:
         hostname = os.environ.get('LABELBOX_TEST_ONPREM_HOSTNAME', None)
         if hostname is None:
@@ -145,7 +145,10 @@ def client(environ: str):
 
 @pytest.fixture(scope="session")
 def image_url(client):
-    return client.upload_data(requests.get(IMG_URL).content, sign=True)
+    return client.upload_data(requests.get(IMG_URL).content,
+                              content_type="application/json",
+                              filename="json_import.json",
+                              sign=True)
 
 
 @pytest.fixture
@@ -181,7 +184,7 @@ def iframe_url(environ) -> str:
     if environ in [Environ.PROD, Environ.LOCAL]:
         return 'https://editor.labelbox.com'
     elif environ == Environ.STAGING:
-        return 'https://staging.labelbox.dev/editor'
+        return 'https://editor.lb-stage.xyz'
 
 
 @pytest.fixture
@@ -290,7 +293,7 @@ def configured_project_with_label(client, rand_gen, image_url, project, dataset,
 
     def create_label():
         """ Ad-hoc function to create a LabelImport
-        
+
         Creates a LabelImport task which will create a label
         """
         upload_task = LabelImport.create_from_objects(