Skip to content

Commit 67cdafa

Browse files
authored
Merge pull request #13 from dannyhann/feature/issue-10
Adding Strict Parsing Mode and RemainChunkTransaction
2 parents 5266671 + ae1f19f commit 67cdafa

File tree

4 files changed

+192
-18
lines changed

4 files changed

+192
-18
lines changed

.coveragerc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
omit =
33
venv/*
44
bin/*
5+
build/*
6+
dist/*
57
.pytest_cache/*
68
LICENSE
79
README.md

coverage.svg

Lines changed: 2 additions & 2 deletions
Loading

protobuf_decoder/protobuf_decoder.py

Lines changed: 114 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -116,19 +116,28 @@ def to_dict(self):
116116
@dataclass
class ParsedResults:
    """Collection of parsed results plus any data that was left over.

    ``remain_data`` carries the leftover data as a string, or ``None`` when
    there is none.
    """

    results: List[ParsedResult]
    remain_data: str = None

    @property
    def has_results(self):
        """Return True when ``results`` contains at least one entry."""
        return len(self.results) != 0

    @property
    def has_remain_data(self):
        """Return True when ``remain_data`` has been set."""
        return self.remain_data is not None

    def __getitem__(self, item):
        """Index straight into ``results``."""
        return self.results[item]

    def to_dict(self):
        """Return a dict view of the results.

        The ``"results"`` key is always present; ``"remain_data"`` is added
        only when leftover data exists.
        """
        payload = {"results": [entry.to_dict() for entry in self.results]}
        if self.has_remain_data:
            payload["remain_data"] = self.remain_data
        return payload
132141

133142

134143
class State(Enum):
@@ -197,6 +206,44 @@ def hex_string_to_utf8(cls, string) -> str:
197206
string = string.replace(" ", "")
198207
return binascii.unhexlify(string).decode("utf-8")
199208

209+
@classmethod
210+
def chunk_to_hex_string(cls, chunk) -> str:
211+
return hex(chunk)[2:].zfill(2)
212+
213+
@classmethod
def change_endian(cls, string) -> str:
    """Swap every adjacent pair of chunks in a hex string.

    The input is checked with ``cls.validate`` and split with
    ``cls.get_chunked_list``. Chunks are exchanged two at a time; when the
    number of chunks is odd, the final chunk keeps its position.

    Returns:
        The reordered chunks joined by single spaces.

    Raises:
        ValueError: If ``cls.validate`` reports the input as invalid.
    """
    ok, normalized = cls.validate(string)
    if not ok:
        raise ValueError("Invalid hex format")

    chunks = list(cls.get_chunked_list(normalized))
    # Visit pairs (0, 1), (2, 3), ...; an unpaired trailing chunk is untouched.
    for left in range(0, len(chunks) - 1, 2):
        chunks[left], chunks[left + 1] = chunks[left + 1], chunks[left]

    return " ".join(chunks)
234+
235+
@classmethod
def show_parsed_results(cls, parsed_results: ParsedResults, depth=0, print_func=print):
    """Print a tab-indented tree view of ``parsed_results``.

    Args:
        parsed_results: The results to display.
        depth: Current nesting level; each level adds one tab of indent.
        print_func: Callable used for every line of output. It receives
            the indent string and the line text as two positional
            arguments.

    Results whose ``data`` is itself a ``ParsedResults`` are printed as a
    header line followed by their children one level deeper. Leftover
    data, when present, is printed last at the current level.
    """
    indent = "\t" * depth

    if parsed_results.has_results:
        for result in parsed_results.results:
            if isinstance(result.data, ParsedResults):
                print_func(indent, f"[{result.field}: {result.wire_type}] =>")
                # Forward print_func so nested levels use the caller's
                # printer rather than falling back to the default.
                cls.show_parsed_results(result.data, depth + 1, print_func)
            else:
                print_func(indent, f"[{result.field}: {result.wire_type}] => {result.data}")

    if parsed_results.has_remain_data:
        print_func(indent, f"left over bytes: {parsed_results.remain_data}")
246+
200247

201248
class BytesBuffer:
202249
def __init__(self):
@@ -254,17 +301,57 @@ def fetch_32bits(self):
254301
self.set_data_length(4 + 1)
255302

256303

304+
class RemainChunkTransaction:
    """Keep track of chunks consumed since the last completed step.

    Every consumed chunk is stored as a hex string. ``done()`` discards the
    stored chunks, so anything still stored is data that was consumed
    without a following ``done()`` call.
    """

    def __init__(self):
        self._remain_hex_string_list = []
        self._is_done = True  # a fresh transaction starts out as done

    def consume_chunk(self, chunk):
        """Store ``chunk`` after converting it with ``Utils.chunk_to_hex_string``."""
        hex_chunk = Utils.chunk_to_hex_string(chunk)
        self._remain_hex_string_list.append(hex_chunk)

    def flush_chunk(self):
        """Drop all stored chunks by starting a new empty list."""
        self._remain_hex_string_list = []

    def start(self):
        """Mark the transaction as in progress."""
        self._is_done = False

    def done(self):
        """Mark the transaction as finished and discard the stored chunks."""
        self._is_done = True
        self.flush_chunk()

    @property
    def is_done(self):
        """Whether the transaction is currently marked as done."""
        return self._is_done

    @property
    def remain_hex_string_list(self):
        """The stored hex strings, in the order they were consumed."""
        return self._remain_hex_string_list

    @property
    def remain_hex_string(self):
        """The stored hex strings joined by single spaces."""
        return " ".join(self._remain_hex_string_list)

    @property
    def has_remain_data(self):
        """Whether at least one chunk is currently stored."""
        return bool(self._remain_hex_string_list)
340+
257341
class Parser:
258-
def __init__(self, nexted_depth: int = 0):
342+
def __init__(self, nexted_depth: int = 0, strict: bool = False):
    """Initialise parser state.

    Args:
        nexted_depth: Nesting level recorded for this parser.
        strict: Strict-mode flag; stored on the instance.
    """
    # Configuration supplied by the caller.
    self._nested_depth = nexted_depth
    self._is_strict = strict

    # Parsing progress, starting in the field-lookup state with no results.
    self._state = State.FIND_FIELD
    self._target_field = None
    self._parsed_data: List[ParsedResult] = []

    # Helper objects used while walking the input.
    self._buffer = BytesBuffer()
    self._fetcher = Fetcher()
    self._t = RemainChunkTransaction()
265352

266353
def _create_nested_parser(self) -> Parser:
    """Build a parser one nesting level deeper that shares this parser's strict flag."""
    child_depth = self._nested_depth + 1
    return Parser(nexted_depth=child_depth, strict=self._is_strict)
268355

269356
@staticmethod
270357
def _has_next(chunk_bytes) -> bool:
@@ -294,6 +381,8 @@ def _handler_find_field(self, chunk):
294381
if self._has_next(chunk):
295382
return self._next_buffer_handler(value)
296383

384+
self._t.start()
385+
297386
self._buffer.append(value)
298387
bit_value = self._get_buffered_value()
299388
wire_type, field = self._parse_wire_type(bit_value)
@@ -314,7 +403,10 @@ def _handler_find_field(self, chunk):
314403
elif wire_type == WireType.EGROUP.value:
315404
self._state = State.TERMINATED
316405
else:
406+
if self._is_strict:
407+
raise AssertionError(f"Invalid wire_type: {wire_type}")
317408
self._state = State.TERMINATED
409+
318410
self._buffer.flush()
319411

320412
def _parse_varint_handler(self, chunk):
@@ -334,6 +426,7 @@ def _parse_varint_handler(self, chunk):
334426

335427
self._state = State.FIND_FIELD
336428
self._buffer.flush()
429+
self._t.done()
337430

338431
def _parse_fixed_handler(self, chunk):
339432
self._next_buffer_handler(chunk)
@@ -354,6 +447,7 @@ def _parse_fixed_handler(self, chunk):
354447
self._state = State.FIND_FIELD
355448
self._buffer.flush()
356449
self._fetcher.seek()
450+
self._t.done()
357451

358452
def _zero_length_delimited_handler(self):
359453
self._parsed_data.append(
@@ -365,6 +459,7 @@ def _zero_length_delimited_handler(self):
365459
)
366460
self._state = State.FIND_FIELD
367461
self._buffer.flush()
462+
self._t.done()
368463

369464
def _parse_length_delimited_handler(self, chunk):
370465
value = self._get_value(chunk)
@@ -379,6 +474,7 @@ def _parse_length_delimited_handler(self, chunk):
379474
self._fetcher.set_data_length(data_length)
380475
self._state = State.GET_DELIMITED_DATA
381476
self._buffer.flush()
477+
self._t.done()
382478

383479
def _next_get_delimited_data_handler(self, value):
384480
self._fetcher.fetch()
@@ -439,9 +535,16 @@ def _get_delimited_data_handler(self, chunk):
439535
self._buffer.flush()
440536
self._fetcher.seek()
441537
self._state = State.FIND_FIELD
538+
self._t.done()
442539

443540
def _create_parsed_results(self) -> ParsedResults:
    """Wrap the parsed data in a ``ParsedResults``.

    ``remain_data`` is passed only when the chunk transaction still holds
    data; otherwise it is left at its default.
    """
    if self._t.has_remain_data:
        return ParsedResults(
            results=self._parsed_data,
            remain_data=self._t.remain_hex_string,
        )

    return ParsedResults(results=self._parsed_data)
445548

446549
def parse(self, test_target) -> ParsedResults:
447550
if test_target == "":
@@ -454,6 +557,8 @@ def parse(self, test_target) -> ParsedResults:
454557
for hex_chunk in Utils.get_chunked_list(validate_string):
455558
chunk = Utils.hex_string_to_decimal(hex_chunk)
456559

560+
self._t.consume_chunk(chunk)
561+
457562
if self._state == State.FIND_FIELD:
458563
self._handler_find_field(chunk)
459564

@@ -478,8 +583,12 @@ def parse(self, test_target) -> ParsedResults:
478583
continue
479584

480585
elif self._state == State.TERMINATED:
481-
return self._create_parsed_results()
586+
pass
587+
482588
else:
483589
raise ValueError(f"Unsupported State {self._state}")
484590

591+
if self._is_strict:
592+
assert self._t.is_done, "parsing process is not done, Maybe invalid protobuf"
593+
485594
return self._create_parsed_results()

tests.py

Lines changed: 74 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -3,16 +3,6 @@
33
from protobuf_decoder.protobuf_decoder import Utils, Parser, ParsedResult, ParsedResults, FixedBitsValue
44

55

6-
def show_parsed_results(parsed_results: ParsedResults, depth=0):
7-
if parsed_results.has_results:
8-
for result in parsed_results.results:
9-
if isinstance(result.data, ParsedResults):
10-
print("\t" * depth, f"[{result.field}: {result.wire_type}] =>")
11-
show_parsed_results(result.data, depth + 1)
12-
else:
13-
print("\t" * depth, f"[{result.field}: {result.wire_type}] => {result.data}")
14-
15-
166
def test_binary_validate():
177
is_valid, _ = Utils.validate("08 12")
188
assert is_valid is True
@@ -27,6 +17,16 @@ def test_binary_validate():
2717
assert is_valid is False
2818

2919

20+
def test_change_endian():
    """change_endian rejects invalid hex and swaps adjacent chunk pairs."""
    try:
        Utils.change_endian("081H")
    except ValueError:
        pass
    else:
        # Fail when the invalid input is accepted without an error.
        assert False

    assert Utils.change_endian("0812") == "12 08"
    # An odd number of chunks leaves the last chunk in place.
    assert Utils.change_endian("08 12 12") == "12 08 12"
    assert Utils.change_endian("08 1212 32") == "12 08 32 12"
28+
29+
3030
def test_get_chunked_list():
3131
_, validate_string = Utils.validate("08 12")
3232
chunked_list = Utils.get_chunked_list(validate_string)
@@ -191,8 +191,12 @@ def test_parser6():
191191
"""
192192
test_target = " ".join(['ed', '85', '8c', 'ec', '8a', 'a4', 'ed', '8a', 'b8'])
193193
parsed_data = Parser().parse(test_target)
194+
194195
assert parsed_data.has_results is False
195-
assert parsed_data.to_dict() == {'results': []}
196+
197+
assert parsed_data.to_dict() == {'results': [], 'remain_data': 'ed 85 8c ec 8a a4 ed 8a b8', }
198+
assert parsed_data.has_remain_data is True
199+
assert parsed_data.remain_data == "ed 85 8c ec 8a a4 ed 8a b8"
196200

197201

198202
def test_parser7():
@@ -234,6 +238,7 @@ def test_parser8():
234238
235239
# binary
236240
0A 04 74 65 73 74 0A 05 74 65 73 74 32
241+
0A 04 74 65 73 74 0A 05 74 65 73 74 32 12 01 61 12 01 62
237242
238243
"""
239244
test_target = "0A 04 74 65 73 74 0A 05 74 65 73 74 32"
@@ -1905,3 +1910,61 @@ def test_inner_protobuf_8():
19051910
ParsedResult(field=2, wire_type='varint',
19061911
data=1)])),
19071912
ParsedResult(field=12, wire_type='varint', data=0)]))])
1913+
1914+
1915+
def test_strict_protobuf_1():
    """Strict parsing raises AssertionError reporting that parsing is not done."""
    payload = "02 04 74 65 73 74 02 05 74 65 73 74 32 00 00 00 00 0d 1d"
    caught = None
    try:
        Parser(strict=True).parse(payload)
    except AssertionError as exc:
        caught = exc

    # The parse call must have raised, and with the expected message.
    assert caught is not None
    assert "parsing process is not done" in str(caught)
1923+
1924+
1925+
def test_strict_protobuf_2():
    """Strict parsing raises AssertionError naming the invalid wire type."""
    payload = "800000000f677270632d7374617475733a300d"
    caught = None
    try:
        Parser(strict=True).parse(payload)
    except AssertionError as exc:
        caught = exc

    # The parse call must have raised, and with the expected message.
    assert caught is not None
    assert "Invalid wire_type: 7" in str(caught)
1933+
1934+
1935+
def test_remain_protobuf_1():
    """Non-strict parsing keeps the parsed fields and reports the leftover bytes."""
    leftover = '0d 1d'
    # (wire_type, data) for each expected result; every field number is 0.
    expected_fields = [
        ('string', 'test'),
        ('string', 'test2'),
        ('varint', 0),
        ('varint', 0),
    ]

    parsed_data = Parser().parse("02 04 74 65 73 74 02 05 74 65 73 74 32 00 00 00 00 0d 1d")

    assert parsed_data.has_remain_data
    assert parsed_data.remain_data == leftover
    assert parsed_data == ParsedResults(
        results=[
            ParsedResult(field=0, wire_type=wire_type, data=data)
            for wire_type, data in expected_fields
        ],
        remain_data=leftover,
    )
    assert parsed_data.to_dict() == {
        'remain_data': leftover,
        'results': [
            {'data': data, 'field': 0, 'wire_type': wire_type}
            for wire_type, data in expected_fields
        ],
    }
1953+
1954+
1955+
def test_remain_protobuf_2():
    """Non-strict parsing of the same input used by test_strict_protobuf_2 returns results plus leftovers."""
    leftover = '67 72 70 63 2d 73 74 61 74 75 73 3a 30 0d'
    expected_values = [0, 15]  # data of each expected varint; field number is 0

    parsed_data = Parser(strict=False).parse("800000000f677270632d7374617475733a300d")

    assert parsed_data.has_remain_data
    assert parsed_data.remain_data == leftover

    assert parsed_data == ParsedResults(
        results=[
            ParsedResult(field=0, wire_type='varint', data=value)
            for value in expected_values
        ],
        remain_data=leftover,
    )
    assert parsed_data.to_dict() == {
        'remain_data': leftover,
        'results': [
            {'data': value, 'field': 0, 'wire_type': 'varint'}
            for value in expected_values
        ],
    }

0 commit comments

Comments
 (0)