Skip to content

Commit 3b5e5ad

Browse files
authored
[AL-5458] Add our own library to use Python json parser to parse ndjson (#1076)
1 parent 98759ab commit 3b5e5ad

File tree

3 files changed

+155
-0
lines changed

3 files changed

+155
-0
lines changed
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
from io import FileIO, StringIO
2+
import json
3+
from typing import Iterable, Union
4+
5+
6+
def loads(ndjson_string: str, **kwargs) -> list:
    """Parse a newline-delimited JSON (ndjson) string into a list of objects.

    Records are separated on carriage return / line feed only. Blank and
    whitespace-only lines (for example a trailing empty line) are ignored.

    Args:
        ndjson_string: Text holding one JSON document per line.
        **kwargs: Forwarded unchanged to ``json.loads``.

    Returns:
        The decoded records, in input order. Empty input yields ``[]``.

    Raises:
        json.JSONDecodeError: If the joined records are not valid JSON.
    """
    # NOTE: str.splitlines() is deliberately avoided. It also breaks on
    # U+2028, U+2029 and U+0085, which are legal unescaped characters inside
    # a JSON string; splitting there would silently inject a comma into the
    # value. CR is mapped to LF so CRLF and bare-CR input keep working; the
    # empty lines this creates are dropped by the blank-line filter below.
    candidates = ndjson_string.replace('\r', '\n').split('\n')
    records = [line for line in candidates if line.strip(' \t')]
    # NOTE: this is a hack to make json.loads work for ndjson: the records
    # are wrapped into one JSON array so the parser is invoked only once.
    text = f"[{','.join(records)}]"
    return json.loads(text, **kwargs)
11+
12+
13+
def dumps(obj: list, **kwargs) -> str:
    """Serialize an iterable of objects to newline-delimited JSON text.

    Args:
        obj: The records to serialize; each one becomes one output line.
        **kwargs: Forwarded unchanged to ``json.dumps`` for every record.

    Returns:
        The serialized records joined with a line feed. No trailing line
        feed is appended, and an empty input yields an empty string.
    """
    # A distinct loop name avoids shadowing the ``obj`` parameter.
    return '\n'.join(json.dumps(record, **kwargs) for record in obj)
16+
17+
18+
def reader(io_handle: Union[StringIO, FileIO, Iterable], **kwargs):
    """Lazily decode newline-delimited JSON from a line iterable.

    Args:
        io_handle: Any iterable yielding one JSON document per item, such as
            an open text or binary file, a ``StringIO``, or a list of lines.
        **kwargs: Forwarded unchanged to ``json.loads`` for every line.

    Yields:
        The decoded object for each non-blank line, in input order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    for line in io_handle:
        # Blank or whitespace-only lines (e.g. a trailing empty line) carry
        # no record; json.loads would raise on them, so skip them.
        if not line.strip():
            continue
        yield json.loads(line, **kwargs)

tests/unit/conftest.py

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
import requests
2+
3+
import pytest
4+
5+
6+
@pytest.fixture
def ndjson_content():
    """Provide a two-record ndjson document and the objects it decodes to.

    Returns:
        A ``(text, objects)`` tuple: ``text`` holds two JSON records
        separated by a single line feed (no trailing line feed) and
        ``objects`` is the list of dictionaries those records represent.
    """
    bbox_record = '{"uuid": "9fd9a92e-2560-4e77-81d4-b2e955800092", "schemaId": "ckaeasyfk004y0y7wyye5epgu", "dataRow": {"id": "ck7kftpan8ir008910yf07r9c"}, "bbox": {"top": 48, "left": 58, "height": 865, "width": 1512}}'
    polygon_record = '{"uuid": "29b878f3-c2b4-4dbf-9f22-a795f0720125", "schemaId": "ckapgvrl7007q0y7ujkjkaaxt", "dataRow": {"id": "ck7kftpan8ir008910yf07r9c"}, "polygon": [{"x": 147.692, "y": 118.154}, {"x": 142.769, "y": 404.923}, {"x": 57.846, "y": 318.769}, {"x": 28.308, "y": 169.846}]}'
    ndjson_text = '\n'.join([bbox_record, polygon_record])

    expected_bbox = {
        'uuid': '9fd9a92e-2560-4e77-81d4-b2e955800092',
        'schemaId': 'ckaeasyfk004y0y7wyye5epgu',
        'dataRow': {'id': 'ck7kftpan8ir008910yf07r9c'},
        'bbox': {'top': 48, 'left': 58, 'height': 865, 'width': 1512},
    }

    # Polygon vertices as (x, y) pairs, in the order they appear in the text.
    vertices = [
        (147.692, 118.154),
        (142.769, 404.923),
        (57.846, 318.769),
        (28.308, 169.846),
    ]
    expected_polygon = {
        'uuid': '29b878f3-c2b4-4dbf-9f22-a795f0720125',
        'schemaId': 'ckapgvrl7007q0y7ujkjkaaxt',
        'dataRow': {'id': 'ck7kftpan8ir008910yf07r9c'},
        'polygon': [{'x': x, 'y': y} for x, y in vertices],
    }

    return ndjson_text, [expected_bbox, expected_polygon]
46+
47+
48+
@pytest.fixture
def ndjson_content_with_nonascii_and_line_breaks():
    """Provide a single-record ndjson document containing non-ASCII text.

    The record is a ``PushEvent`` payload whose commit message mixes ASCII
    with non-ASCII characters.

    Returns:
        A ``(text, objects)`` tuple: the raw one-line JSON text and the
        one-element list of dictionaries it represents.
    """
    event_text = '{"id": "2489651127", "type": "PushEvent", "actor": {"id": 1459915, "login": "xtuaok", "gravatar_id": "", "url": "https://api.github.com/users/xtuaok", "avatar_url": "https://avatars.githubusercontent.com/u/1459915?"}, "repo": {"id": 6719841, "name": "xtuaok/twitter_track_following", "url": "https://api.github.com/repos/xtuaok/twitter_track_following"}, "payload": {"push_id": 536864008, "size": 1, "distinct_size": 1, "ref": "refs/heads/xtuaok", "head": "afb8afe306c7893d93d383a06e4d9df53b41bf47", "before": "4671b4868f1a060f2ed64d8268cd22d514a84e63", "commits": [{"sha": "afb8afe306c7893d93d383a06e4d9df53b41bf47", "author": {"email": "47cb89439b2d6961b59dff4298e837f67aa77389@gmail.com", "name": "Tomonori Tamagawa"}, "message": "Update ID 949438177,, - screen_name: chomado, - name: ちょまど@初詣おみくじ凶, - description: ( *゚▽゚* っ)З腐女子!絵描き!| H26新卒文系SE (入社して4ヶ月目の8月にSIer(適応障害になった)を辞職し開発者に転職) | H26秋応用情報合格!| 自作bot (in PHP) chomado_bot | プログラミングガチ初心者, - location:", "distinct": true, "url": "https://api.github.com/repos/xtuaok/twitter_track_following/commits/afb8afe306c7893d93d383a06e4d9df53b41bf47"}]}, "public": true, "created_at": "2015-01-01T15:00:10Z"}'

    actor = {
        'id': 1459915,
        'login': 'xtuaok',
        'gravatar_id': '',
        'url': 'https://api.github.com/users/xtuaok',
        'avatar_url': 'https://avatars.githubusercontent.com/u/1459915?',
    }
    repo = {
        'id': 6719841,
        'name': 'xtuaok/twitter_track_following',
        'url': 'https://api.github.com/repos/xtuaok/twitter_track_following',
    }
    commit = {
        'sha': 'afb8afe306c7893d93d383a06e4d9df53b41bf47',
        'author': {
            'email': '47cb89439b2d6961b59dff4298e837f67aa77389@gmail.com',
            'name': 'Tomonori Tamagawa',
        },
        'message': 'Update ID 949438177,, - screen_name: chomado, - name: ちょまど@初詣おみくじ凶, - description: ( *゚▽゚* っ)З腐女子!絵描き!| H26新卒文系SE (入社して4ヶ月目の8月にSIer(適応障害になった)を辞職し開発者に転職) | H26秋応用情報合格!| 自作bot (in PHP) chomado_bot | プログラミングガチ初心者, - location:',
        'distinct': True,
        'url': 'https://api.github.com/repos/xtuaok/twitter_track_following/commits/afb8afe306c7893d93d383a06e4d9df53b41bf47',
    }
    payload = {
        'push_id': 536864008,
        'size': 1,
        'distinct_size': 1,
        'ref': 'refs/heads/xtuaok',
        'head': 'afb8afe306c7893d93d383a06e4d9df53b41bf47',
        'before': '4671b4868f1a060f2ed64d8268cd22d514a84e63',
        'commits': [commit],
    }
    expected_event = {
        'id': '2489651127',
        'type': 'PushEvent',
        'actor': actor,
        'repo': repo,
        'payload': payload,
        'public': True,
        'created_at': '2015-01-01T15:00:10Z',
    }

    return event_text, [expected_event]

tests/unit/test_ndjson_parsing.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
import ast
2+
import random
3+
import time
4+
from io import StringIO
5+
import ndjson
6+
from labelbox.data.serialization.ndjson import parser
7+
8+
9+
def test_loads(ndjson_content):
    """Round-trip check: ``loads`` decodes the text and ``dumps`` restores it."""
    ndjson_text, expected_objects = ndjson_content

    decoded = parser.loads(ndjson_text)

    assert decoded == expected_objects
    # Serializing the decoded records must reproduce the input text exactly.
    assert parser.dumps(decoded) == ndjson_text
15+
16+
17+
def test_reader_stringio(ndjson_content):
    """``parser.reader`` yields every record of a ``StringIO`` in order."""
    line, ndjson_objects = ndjson_content

    text_io = StringIO(line)
    # Drain the generator directly; the enumerate index was never used.
    parsed_arr = list(parser.reader(text_io))

    assert parsed_arr == ndjson_objects
26+
27+
28+
def test_non_ascii_new_line(ndjson_content_with_nonascii_and_line_breaks):
    """Non-ASCII content survives a ``loads`` / ``dumps`` round trip."""
    line, expected_objects = ndjson_content_with_nonascii_and_line_breaks
    parsed = parser.loads(line)

    assert parsed == expected_objects

    # NOTE: the json serializer escapes non-ASCII characters as \uXXXX by
    # default, but the fixture text holds the actual characters. Ask the
    # serializer to emit them verbatim rather than un-escaping the output
    # through ast.literal_eval, which breaks as soon as the serialized text
    # contains a single quote or another backslash escape.
    assert parser.dumps(parsed, ensure_ascii=False) == line

0 commit comments

Comments
 (0)