[AL-5458] Add our own library to use Python json parser to parse ndjson (#1076)

vbrodsky · web-flow · commit 3b5e5ad4c55f · 2023-05-11T11:22:25.000-07:00
diff --git a/labelbox/data/serialization/ndjson/parser.py b/labelbox/data/serialization/ndjson/parser.py
@@ -0,0 +1,20 @@
+from io import FileIO, StringIO
+import json
+from typing import Iterable, Union
+
+
+def loads(ndjson_string: str, **kwargs) -> list:
+    """Parse NDJSON text into a list holding one ``json.loads(line, **kwargs)`` result per non-blank line."""
+    # NOTE: split on CR/LF only; str.splitlines() also splits on U+2028/U+2029/U+0085, which may occur inside JSON strings
+    lines = ndjson_string.replace('\r', '\n').split('\n')
+    return [json.loads(line, **kwargs) for line in lines if line.strip()]
+
+
+def dumps(obj: list, **kwargs) -> str:
+    """Serialize each item of ``obj`` with ``json.dumps(item, **kwargs)`` and join the results with newlines."""
+    return '\n'.join(json.dumps(item, **kwargs) for item in obj)
+
+
+def reader(io_handle: Union[StringIO, FileIO, Iterable], **kwargs):
+    """Lazily yield ``json.loads(line, **kwargs)`` for each line of ``io_handle``, skipping blank lines."""
+    yield from (json.loads(line, **kwargs) for line in io_handle if line.strip())
diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py
@@ -0,0 +1,99 @@
+import requests
+
+import pytest
+
+
+@pytest.fixture
+def ndjson_content():  # returns (ndjson_text, expected_objects): two NDJSON records and the dicts they should parse to
+    line = """{"uuid": "9fd9a92e-2560-4e77-81d4-b2e955800092", "schemaId": "ckaeasyfk004y0y7wyye5epgu", "dataRow": {"id": "ck7kftpan8ir008910yf07r9c"}, "bbox": {"top": 48, "left": 58, "height": 865, "width": 1512}}
+{"uuid": "29b878f3-c2b4-4dbf-9f22-a795f0720125", "schemaId": "ckapgvrl7007q0y7ujkjkaaxt", "dataRow": {"id": "ck7kftpan8ir008910yf07r9c"}, "polygon": [{"x": 147.692, "y": 118.154}, {"x": 142.769, "y": 404.923}, {"x": 57.846, "y": 318.769}, {"x": 28.308, "y": 169.846}]}"""
+    expected_objects = [{
+        'uuid': '9fd9a92e-2560-4e77-81d4-b2e955800092',
+        'schemaId': 'ckaeasyfk004y0y7wyye5epgu',
+        'dataRow': {
+            'id': 'ck7kftpan8ir008910yf07r9c'
+        },
+        'bbox': {
+            'top': 48,
+            'left': 58,
+            'height': 865,
+            'width': 1512
+        }
+    }, {
+        'uuid':
+            '29b878f3-c2b4-4dbf-9f22-a795f0720125',
+        'schemaId':
+            'ckapgvrl7007q0y7ujkjkaaxt',
+        'dataRow': {
+            'id': 'ck7kftpan8ir008910yf07r9c'
+        },
+        'polygon': [{
+            'x': 147.692,
+            'y': 118.154
+        }, {
+            'x': 142.769,
+            'y': 404.923
+        }, {
+            'x': 57.846,
+            'y': 318.769
+        }, {
+            'x': 28.308,
+            'y': 169.846
+        }]
+    }]
+
+    return line, expected_objects
+
+
+@pytest.fixture
+def ndjson_content_with_nonascii_and_line_breaks():  # returns (ndjson_text, expected_objects): one record whose commit message holds non-ASCII text
+    line = '{"id": "2489651127", "type": "PushEvent", "actor": {"id": 1459915, "login": "xtuaok", "gravatar_id": "", "url": "https://api.github.com/users/xtuaok", "avatar_url": "https://avatars.githubusercontent.com/u/1459915?"}, "repo": {"id": 6719841, "name": "xtuaok/twitter_track_following", "url": "https://api.github.com/repos/xtuaok/twitter_track_following"}, "payload": {"push_id": 536864008, "size": 1, "distinct_size": 1, "ref": "refs/heads/xtuaok", "head": "afb8afe306c7893d93d383a06e4d9df53b41bf47", "before": "4671b4868f1a060f2ed64d8268cd22d514a84e63", "commits": [{"sha": "afb8afe306c7893d93d383a06e4d9df53b41bf47", "author": {"email": "47cb89439b2d6961b59dff4298e837f67aa77389@gmail.com", "name": "Tomonori Tamagawa"}, "message": "Update ID 949438177,, - screen_name: chomado, - name: ちょまど@初詣おみくじ凶, - description: ( *ﾟ▽ﾟ* っ)З腐女子！絵描き！| H26新卒文系SE (入社して4ヶ月目の8月にSIer(適応障害になった)を辞職し開発者に転職) | H26秋応用情報合格！| 自作bot (in PHP) chomado_bot | プログラミングガチ初心者, - location:", "distinct": true, "url": "https://api.github.com/repos/xtuaok/twitter_track_following/commits/afb8afe306c7893d93d383a06e4d9df53b41bf47"}]}, "public": true, "created_at": "2015-01-01T15:00:10Z"}'
+    expected_objects = [{
+        'id': '2489651127',
+        'type': 'PushEvent',
+        'actor': {
+            'id': 1459915,
+            'login': 'xtuaok',
+            'gravatar_id': '',
+            'url': 'https://api.github.com/users/xtuaok',
+            'avatar_url': 'https://avatars.githubusercontent.com/u/1459915?'
+        },
+        'repo': {
+            'id': 6719841,
+            'name': 'xtuaok/twitter_track_following',
+            'url': 'https://api.github.com/repos/xtuaok/twitter_track_following'
+        },
+        'payload': {
+            'push_id':
+                536864008,
+            'size':
+                1,
+            'distinct_size':
+                1,
+            'ref':
+                'refs/heads/xtuaok',
+            'head':
+                'afb8afe306c7893d93d383a06e4d9df53b41bf47',
+            'before':
+                '4671b4868f1a060f2ed64d8268cd22d514a84e63',
+            'commits': [{
+                'sha':
+                    'afb8afe306c7893d93d383a06e4d9df53b41bf47',
+                'author': {
+                    'email':
+                        '47cb89439b2d6961b59dff4298e837f67aa77389@gmail.com',
+                    'name':
+                        'Tomonori Tamagawa'
+                },
+                'message':
+                    'Update ID 949438177,, - screen_name: chomado, - name: ちょまど@初詣おみくじ凶, - description: ( *ﾟ▽ﾟ* っ)З腐女子！絵描き！| H26新卒文系SE (入社して4ヶ月目の8月にSIer(適応障害になった)を辞職し開発者に転職) | H26秋応用情報合格！| 自作bot (in PHP) chomado_bot | プログラミングガチ初心者, - location:',
+                'distinct':
+                    True,
+                'url':
+                    'https://api.github.com/repos/xtuaok/twitter_track_following/commits/afb8afe306c7893d93d383a06e4d9df53b41bf47'
+            }]
+        },
+        'public': True,
+        'created_at': '2015-01-01T15:00:10Z'
+    }]
+    return line, expected_objects
diff --git a/tests/unit/test_ndjson_parsing.py b/tests/unit/test_ndjson_parsing.py
@@ -0,0 +1,36 @@
+import ast
+import random
+import time
+from io import StringIO
+import ndjson
+from labelbox.data.serialization.ndjson import parser
+
+
+def test_loads(ndjson_content):
+    """Round trip: ``loads`` yields the expected objects and ``dumps`` restores the original text."""
+    ndjson_text, expected_objects = ndjson_content
+    parsed_objects = parser.loads(ndjson_text)
+    assert parsed_objects == expected_objects
+    assert parser.dumps(parsed_objects) == ndjson_text
+
+
+def test_reader_stringio(ndjson_content):
+    """``reader`` over a ``StringIO`` yields the same objects, in order, as the fixture expects."""
+    ndjson_text, expected_objects = ndjson_content
+
+    # Feed the text through an in-memory stream so the reader iterates it line by line.
+    stream = StringIO(ndjson_text)
+    # Drain the generator; every yielded item is one parsed NDJSON record.
+    parsed_objects = list(parser.reader(stream))
+    assert parsed_objects == expected_objects
+
+
+def test_non_ascii_new_line(ndjson_content_with_nonascii_and_line_breaks):
+    """Non-ASCII text survives a ``loads``/``dumps`` round trip unchanged."""
+    line, expected_objects = ndjson_content_with_nonascii_and_line_breaks
+    parsed = parser.loads(line)
+    assert parsed == expected_objects
+
+    # NOTE: json.dumps escapes non-ASCII characters as \uXXXX by default; ensure_ascii=False keeps them literal
+    #   so the output can be compared directly with the fixture text, without re-evaluating escape sequences.
+    assert parser.dumps(parsed, ensure_ascii=False) == line