ability to use ndjson converter with pdf and test updates

jtsodapop · jtsodapop · commit acdd0b34f1ab · 2022-08-19T12:44:00.000-04:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -8,6 +8,10 @@
     * Resets model run training metadata
 * `ModelRun.get_config()`
     * Fetches model run training metadata
+* Support for document (pdf) de/serialization from exports
+    * Use the `LBV1Converter.serialize()` and `LBV1Converter.deserialize()` methods
+* Support for document (pdf) de/serialization for imports
+    * Use the `NDJsonConverter.serialize()` and `NDJsonConverter.deserialize()` methods
     
 ### Changed
 * `Model.create_model_run()`
diff --git a/Dockerfile b/Dockerfile
@@ -1,4 +1,4 @@
-FROM python:3.7
+FROM python:3.8
 
 RUN pip install pytest pytest-cases pytest-rerunfailures
 RUN apt-get -y update
diff --git a/labelbox/data/serialization/ndjson/objects.py b/labelbox/data/serialization/ndjson/objects.py
@@ -1,6 +1,6 @@
 from ast import Bytes
 from io import BytesIO
-from typing import Any, Dict, List, Tuple, Union
+from typing import Any, Dict, List, Tuple, Union, Optional
 import base64
 import numpy as np
 
@@ -21,6 +21,8 @@
 
 class NDBaseObject(NDAnnotation):
     classifications: List[NDSubclassificationType] = []
+    page: Optional[int] = None  # pdf page of the annotation; None if unset (indexing base: TODO confirm)
+    unit: Optional[str] = None  # coordinate unit, e.g. "POINTS" in the pdf fixtures; None if unset
 
 
 class VideoSupported(BaseModel):
@@ -167,7 +169,9 @@ def from_common(cls, rectangle: Rectangle,
                    name=name,
                    schema_id=feature_schema_id,
                    uuid=extra.get('uuid'),
-                   classifications=classifications)
+                   classifications=classifications,
+                   page=extra.get('page'),
+                   unit=extra.get('unit'))
 
 
 class NDFrameRectangle(VideoSupported):
@@ -352,7 +356,11 @@ def to_common(annotation: "NDObjectType") -> ObjectAnnotation:
                                 name=annotation.name,
                                 feature_schema_id=annotation.schema_id,
                                 classifications=classifications,
-                                extra={'uuid': annotation.uuid})
+                                extra={
+                                    'uuid': annotation.uuid,
+                                    'page': annotation.page,
+                                    'unit': annotation.unit
+                                })
 
     @classmethod
     def from_common(
diff --git a/tests/data/assets/ndjson/pdf_import.json b/tests/data/assets/ndjson/pdf_import.json
@@ -0,0 +1,97 @@
+[{
+    "uuid": "5ad9c52f-058d-49c8-a749-3f20b84f8cd4",
+    "dataRow": {
+        "id": "cl6xnv9h61fv0085yhtoq06ht"
+    },
+    "name": "boxy",
+    "schemaId": "cl6xnuwt95lqq07330tbb3mfd",
+    "classifications": [],
+    "page": 4,
+    "unit": "POINTS",
+    "bbox": {
+        "top": 162.73,
+        "left": 32.45,
+        "height": 388.16999999999996,
+        "width": 101.66000000000001
+    }
+}, {
+    "uuid": "20eeef88-0294-49b4-a815-86588476bc6f",
+    "dataRow": {
+        "id": "cl6xnv9h61fv0085yhtoq06ht"
+    },
+    "name": "boxy",
+    "schemaId": "cl6xnuwt95lqq07330tbb3mfd",
+    "classifications": [],
+    "page": 7,
+    "unit": "POINTS",
+    "bbox": {
+        "top": 223.26,
+        "left": 251.42,
+        "height": 457.03999999999996,
+        "width": 186.78
+    }
+}, {
+    "uuid": "641a8944-3938-409c-b4eb-dea354ed06e5",
+    "dataRow": {
+        "id": "cl6xnv9h61fv0085yhtoq06ht"
+    },
+    "name": "boxy",
+    "schemaId": "cl6xnuwt95lqq07330tbb3mfd",
+    "classifications": [],
+    "page": 6,
+    "unit": "POINTS",
+    "bbox": {
+        "top": 32.52,
+        "left": 218.17,
+        "height": 231.73,
+        "width": 110.56000000000003
+    }
+}, {
+    "uuid": "ebe4da7d-08b3-480a-8d15-26552b7f011c",
+    "dataRow": {
+        "id": "cl6xnv9h61fv0085yhtoq06ht"
+    },
+    "name": "boxy",
+    "schemaId": "cl6xnuwt95lqq07330tbb3mfd",
+    "classifications": [],
+    "page": 7,
+    "unit": "POINTS",
+    "bbox": {
+        "top": 117.39,
+        "left": 4.25,
+        "height": 456.9200000000001,
+        "width": 164.83
+    }
+}, {
+    "uuid": "35c41855-575f-42cc-a2f9-1f06237e9b63",
+    "dataRow": {
+        "id": "cl6xnv9h61fv0085yhtoq06ht"
+    },
+    "name": "boxy",
+    "schemaId": "cl6xnuwt95lqq07330tbb3mfd",
+    "classifications": [],
+    "page": 8,
+    "unit": "POINTS",
+    "bbox": {
+        "top": 82.13,
+        "left": 217.28,
+        "height": 279.76,
+        "width": 82.43000000000004
+    }
+}, {
+    "uuid": "1b009654-bc17-42a2-8a71-160e7808c403",
+    "dataRow": {
+        "id": "cl6xnv9h61fv0085yhtoq06ht"
+    },
+    "name": "boxy",
+    "schemaId": "cl6xnuwt95lqq07330tbb3mfd",
+    "classifications": [],
+    "page": 3,
+    "unit": "POINTS",
+    "bbox": {
+        "top": 298.12,
+        "left": 83.34,
+        "height": 203.83000000000004,
+        "width": 0.37999999999999545
+    }
+}]
diff --git a/tests/data/assets/ndjson/pdf_import_name_only.json b/tests/data/assets/ndjson/pdf_import_name_only.json
@@ -0,0 +1,91 @@
+[{
+    "uuid": "5ad9c52f-058d-49c8-a749-3f20b84f8cd4",
+    "dataRow": {
+        "id": "cl6xnv9h61fv0085yhtoq06ht"
+    },
+    "name": "boxy",
+    "classifications": [],
+    "page": 4,
+    "unit": "POINTS",
+    "bbox": {
+        "top": 162.73,
+        "left": 32.45,
+        "height": 388.16999999999996,
+        "width": 101.66000000000001
+    }
+}, {
+    "uuid": "20eeef88-0294-49b4-a815-86588476bc6f",
+    "dataRow": {
+        "id": "cl6xnv9h61fv0085yhtoq06ht"
+    },
+    "name": "boxy",
+    "classifications": [],
+    "page": 7,
+    "unit": "POINTS",
+    "bbox": {
+        "top": 223.26,
+        "left": 251.42,
+        "height": 457.03999999999996,
+        "width": 186.78
+    }
+}, {
+    "uuid": "641a8944-3938-409c-b4eb-dea354ed06e5",
+    "dataRow": {
+        "id": "cl6xnv9h61fv0085yhtoq06ht"
+    },
+    "name": "boxy",
+    "classifications": [],
+    "page": 6,
+    "unit": "POINTS",
+    "bbox": {
+        "top": 32.52,
+        "left": 218.17,
+        "height": 231.73,
+        "width": 110.56000000000003
+    }
+}, {
+    "uuid": "ebe4da7d-08b3-480a-8d15-26552b7f011c",
+    "dataRow": {
+        "id": "cl6xnv9h61fv0085yhtoq06ht"
+    },
+    "name": "boxy",
+    "classifications": [],
+    "page": 7,
+    "unit": "POINTS",
+    "bbox": {
+        "top": 117.39,
+        "left": 4.25,
+        "height": 456.9200000000001,
+        "width": 164.83
+    }
+}, {
+    "uuid": "35c41855-575f-42cc-a2f9-1f06237e9b63",
+    "dataRow": {
+        "id": "cl6xnv9h61fv0085yhtoq06ht"
+    },
+    "name": "boxy",
+    "classifications": [],
+    "page": 8,
+    "unit": "POINTS",
+    "bbox": {
+        "top": 82.13,
+        "left": 217.28,
+        "height": 279.76,
+        "width": 82.43000000000004
+    }
+}, {
+    "uuid": "1b009654-bc17-42a2-8a71-160e7808c403",
+    "dataRow": {
+        "id": "cl6xnv9h61fv0085yhtoq06ht"
+    },
+    "name": "boxy",
+    "classifications": [],
+    "page": 3,
+    "unit": "POINTS",
+    "bbox": {
+        "top": 298.12,
+        "left": 83.34,
+        "height": 203.83000000000004,
+        "width": 0.37999999999999545
+    }
+}]
diff --git a/tests/data/serialization/ndjson/test_classification.py b/tests/data/serialization/ndjson/test_classification.py
@@ -4,13 +4,18 @@
 from labelbox.data.serialization.ndjson.converter import NDJsonConverter
 from labelbox.data.serialization.ndjson.objects import NDLine
 
+IGNORE_KEYS = ['unit', 'page']
+
 
 def test_classification():
     with open('tests/data/assets/ndjson/classification_import.json',
               'r') as file:
         data = json.load(file)
     res = NDJsonConverter.deserialize(data).as_list()
     res = list(NDJsonConverter.serialize(res))
+    for r in res:  # drop IGNORE_KEYS (presumably absent from the fixture) before comparing
+        for key in IGNORE_KEYS:
+            r.pop(key, None)  # default of None: no KeyError when the key is missing
     assert res == data
 
 
@@ -20,4 +25,7 @@ def test_classification_with_name():
         data = json.load(file)
     res = NDJsonConverter.deserialize(data).as_list()
     res = list(NDJsonConverter.serialize(res))
+    for r in res:
+        for key in IGNORE_KEYS:
+            r.pop(key, None)
     assert res == data
diff --git a/tests/data/serialization/ndjson/test_document.py b/tests/data/serialization/ndjson/test_document.py
@@ -0,0 +1,40 @@
+import json
+
+from labelbox.data.serialization.ndjson.converter import NDJsonConverter
+
+
+def round_dict(data):
+    """In place, int()-truncate float values of a dict (and of nested dicts)."""
+    entries = data.items() if isinstance(data, dict) else ()
+    for name, value in entries:
+        if isinstance(value, float):
+            data[name] = int(value)
+        elif isinstance(value, dict):
+            data[name] = round_dict(value)
+        elif isinstance(value, (list, tuple)):
+            data[name] = [round_dict(item) for item in value]
+    return data
+
+
+def test_pdf():
+    """
+    Round-trips pdf bbox annotations (name and schemaId) through NDJsonConverter
+    """
+    with open('tests/data/assets/ndjson/pdf_import.json', 'r') as f:
+        data = json.load(f)
+    res = NDJsonConverter.deserialize(data).as_list()
+    res = list(NDJsonConverter.serialize(res))
+    # round_dict truncates floats on both sides before the comparison
+    assert [round_dict(x) for x in res] == [round_dict(x) for x in data]
+
+
+def test_pdf_with_name_only():
+    """
+    Round-trips pdf bbox annotations identified by name only (no schemaId)
+    """
+    with open('tests/data/assets/ndjson/pdf_import_name_only.json', 'r') as f:
+        data = json.load(f)
+    res = NDJsonConverter.deserialize(data).as_list()
+    res = list(NDJsonConverter.serialize(res))
+    # round_dict truncates floats on both sides before the comparison
+    assert [round_dict(x) for x in res] == [round_dict(x) for x in data]
diff --git a/tests/data/serialization/ndjson/test_export_video_objects.py b/tests/data/serialization/ndjson/test_export_video_objects.py
@@ -585,6 +585,10 @@ def video_serialized_bbox_label():
     }
 
 
+# ignore uuid (randomized if there was none) and the pdf-specific page/unit keys
+IGNORE_KEYS = ["uuid", "page", "unit"]
+
+
 def test_serialize_video_objects():
     label = video_bbox_label()
     serialized_labels = NDJsonConverter.serialize([label])
@@ -593,8 +597,7 @@ def test_serialize_video_objects():
     manual_label = video_serialized_bbox_label()
 
     for key in label.keys():
-        #ignore uuid because we randomize if there was none
-        if key != "uuid":
+        if key not in IGNORE_KEYS:
             assert label[key] == manual_label[key]
 
     assert len(label['segments']) == 2
diff --git a/tests/data/serialization/ndjson/test_image.py b/tests/data/serialization/ndjson/test_image.py
@@ -5,6 +5,8 @@
 from labelbox.data.serialization.ndjson.converter import NDJsonConverter
 from labelbox.data.annotation_types import Mask, Label, ObjectAnnotation, ImageData, MaskData
 
+IGNORE_KEYS = ['classifications', 'unit', 'page']
+
 
 def round_dict(data):
     if isinstance(data, dict):
@@ -26,7 +28,8 @@ def test_image():
     res = NDJsonConverter.deserialize(data).as_list()
     res = list(NDJsonConverter.serialize(res))
     for r in res:
-        r.pop('classifications', None)
+        for key in IGNORE_KEYS:
+            r.pop(key, None)
     assert [round_dict(x) for x in res] == [round_dict(x) for x in data]
 
 
@@ -38,7 +41,8 @@ def test_image_with_name_only():
     res = NDJsonConverter.deserialize(data).as_list()
     res = list(NDJsonConverter.serialize(res))
     for r in res:
-        r.pop('classifications', None)
+        for key in IGNORE_KEYS:
+            r.pop(key, None)
     assert [round_dict(x) for x in res] == [round_dict(x) for x in data]
 
 
@@ -68,7 +72,8 @@ def test_mask():
     res = NDJsonConverter.deserialize(data).as_list()
     res = list(NDJsonConverter.serialize(res))
     for r in res:
-        r.pop('classifications', None)
+        for key in IGNORE_KEYS:
+            r.pop(key, None)
 
     assert [round_dict(x) for x in res] == [round_dict(x) for x in data]
 
@@ -94,5 +99,7 @@ def test_mask_from_arr():
         "mask": {
             "png":
                 "iVBORw0KGgoAAAANSUhEUgAAACAAAAAgCAAAAABWESUoAAAAHklEQVR4nGNgGAKAEYn8j00BEyETBoOCUTAKhhwAAJW+AQwvpePVAAAAAElFTkSuQmCC"
-        }
+        },
+        "page": None,
+        "unit": None
     }
diff --git a/tests/data/serialization/ndjson/test_nested.py b/tests/data/serialization/ndjson/test_nested.py
diff --git a/tests/data/serialization/ndjson/test_text.py b/tests/data/serialization/ndjson/test_text.py
diff --git a/tests/data/serialization/ndjson/test_video.py b/tests/data/serialization/ndjson/test_video.py

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-FROM python:3.7`
	`1`	`+FROM python:3.8`
`2`	`2`
`3`	`3`	`RUN pip install pytest pytest-cases pytest-rerunfailures`
`4`	`4`	`RUN apt-get -y update`