Skip to content

Commit e80b3d4

Browse files
committed
hexdiff: 2 algorithms, doc
1 parent cb6dcf4 commit e80b3d4

File tree

2 files changed

+102
-36
lines changed

2 files changed

+102
-36
lines changed

scapy/utils.py

Lines changed: 97 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,12 @@
4444
hex_bytes,
4545
bytes_encode,
4646
)
47-
from scapy.error import log_runtime, Scapy_Exception, warning
47+
from scapy.error import (
48+
log_interactive,
49+
log_runtime,
50+
Scapy_Exception,
51+
warning,
52+
)
4853
from scapy.pton_ntop import inet_pton
4954

5055
# Typing imports
@@ -392,50 +397,111 @@ def repr_hex(s):
392397

393398

394399
@conf.commands.register
395-
def hexdiff(a, b, autojunk=False):
396-
# type: (Union[Packet, AnyStr], Union[Packet, AnyStr], bool) -> None
400+
def hexdiff(
401+
a: Union['Packet', AnyStr],
402+
b: Union['Packet', AnyStr],
403+
algo: Optional[str] = None,
404+
autojunk: bool = False,
405+
) -> None:
397406
"""
398407
Show differences between 2 binary strings, Packets...
399408
400-
For the autojunk parameter, see
401-
https://docs.python.org/3.8/library/difflib.html#difflib.SequenceMatcher
409+
Available algorithms:
410+
- wagnerfischer: Use the Wagner and Fischer algorithm to compute the
411+
Levenshtein distance between the strings, then backtrack.
412+
- difflib: Use the difflib.SequenceMatcher implementation. This is based on a
413+
modified version of the Ratcliff and Obershelp algorithm.
414+
This is much faster, but far less accurate.
415+
https://docs.python.org/3.8/library/difflib.html#difflib.SequenceMatcher
402416
403417
:param a:
404418
:param b: The binary strings, packets... to compare
405-
:param autojunk: Setting it to True will likely increase the comparison
406-
speed a lot on big byte strings, but will reduce accuracy (will tend
407-
to miss insertion and see replacements instead for instance).
419+
:param algo: Force the algo to be 'wagnerfischer' or 'difflib'.
420+
By default, this is chosen depending on the complexity, optimistically
421+
preferring wagnerfischer unless really necessary.
422+
:param autojunk: (difflib only) See difflib documentation.
408423
"""
409-
410-
# Compare the strings using difflib
411-
412424
xb = bytes_encode(a)
413425
yb = bytes_encode(b)
414426

415-
sm = difflib.SequenceMatcher(a=xb, b=yb, autojunk=autojunk)
416-
xarr = [xb[i:i + 1] for i in range(len(xb))]
417-
yarr = [yb[i:i + 1] for i in range(len(yb))]
427+
if algo is None:
428+
# Choose the best algorithm
429+
complexity = len(xb) * len(yb)
430+
if complexity < 1e7:
431+
# Comparing two (non-jumbos) Ethernet packets is ~2e6 which is manageable.
432+
# Anything much larger than this shouldn't be attempted by default.
433+
algo = "wagnerfischer"
434+
if complexity > 1e6:
435+
log_interactive.info(
436+
"Complexity is a bit high. hexdiff will take a few seconds."
437+
)
438+
else:
439+
algo = "difflib"
418440

419441
backtrackx = []
420442
backtracky = []
421-
for opcode in sm.get_opcodes():
422-
typ, x0, x1, y0, y1 = opcode
423-
if typ == 'delete':
424-
backtrackx += xarr[x0:x1]
425-
backtracky += [b''] * (x1 - x0)
426-
elif typ == 'insert':
427-
backtrackx += [b''] * (y1 - y0)
428-
backtracky += yarr[y0:y1]
429-
elif typ in ['equal', 'replace']:
430-
backtrackx += xarr[x0:x1]
431-
backtracky += yarr[y0:y1]
432-
433-
if autojunk:
443+
444+
if algo == "wagnerfischer":
445+
xb = xb[::-1]
446+
yb = yb[::-1]
447+
448+
# costs for the 3 operations
449+
INSERT = 1
450+
DELETE = 1
451+
SUBST = 1
452+
453+
# Typically, d[i,j] will hold the distance between
454+
# the first i characters of xb and the first j characters of yb.
455+
# We change the Wagner Fischer to also store pointers to all
456+
# the intermediate steps taken while calculating the Levenstein distance.
457+
d = {(-1, -1): (0, (-1, -1))}
458+
for j in range(len(yb)):
459+
d[-1, j] = (j + 1) * INSERT, (-1, j - 1)
460+
for i in range(len(xb)):
461+
d[i, -1] = (i + 1) * INSERT + 1, (i - 1, -1)
462+
463+
# Compute the Levenstein distance between the two strings, but
464+
# store all the steps to be able to backtrack at the end.
465+
for j in range(len(yb)):
466+
for i in range(len(xb)):
467+
d[i, j] = min(
468+
(d[i - 1, j - 1][0] + SUBST * (xb[i] != yb[j]), (i - 1, j - 1)),
469+
(d[i - 1, j][0] + DELETE, (i - 1, j)),
470+
(d[i, j - 1][0] + INSERT, (i, j - 1)),
471+
)
472+
473+
# Iterate through the steps backwards to create the diff
474+
i = len(xb) - 1
475+
j = len(yb) - 1
476+
while not (i == j == -1):
477+
i2, j2 = d[i, j][1]
478+
backtrackx.append(xb[i2 + 1:i + 1])
479+
backtracky.append(yb[j2 + 1:j + 1])
480+
i, j = i2, j2
481+
elif algo == "difflib":
482+
sm = difflib.SequenceMatcher(a=xb, b=yb, autojunk=autojunk)
483+
xarr = [xb[i:i + 1] for i in range(len(xb))]
484+
yarr = [yb[i:i + 1] for i in range(len(yb))]
485+
# Iterate through opcodes to build the backtrack
486+
for opcode in sm.get_opcodes():
487+
typ, x0, x1, y0, y1 = opcode
488+
if typ == 'delete':
489+
backtrackx += xarr[x0:x1]
490+
backtracky += [b''] * (x1 - x0)
491+
elif typ == 'insert':
492+
backtrackx += [b''] * (y1 - y0)
493+
backtracky += yarr[y0:y1]
494+
elif typ in ['equal', 'replace']:
495+
backtrackx += xarr[x0:x1]
496+
backtracky += yarr[y0:y1]
434497
# Some lines may have been considered as junk. Check the sizes
435-
lbx = len(backtrackx)
436-
lby = len(backtracky)
437-
backtrackx += [b''] * (max(lbx, lby) - lbx)
438-
backtracky += [b''] * (max(lbx, lby) - lby)
498+
if autojunk:
499+
lbx = len(backtrackx)
500+
lby = len(backtracky)
501+
backtrackx += [b''] * (max(lbx, lby) - lbx)
502+
backtracky += [b''] * (max(lbx, lby) - lby)
503+
else:
504+
raise ValueError("Unknown algorithm '%s'" % algo)
439505

440506
# Print the diff
441507

test/regression.uts

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -867,11 +867,11 @@ assert fletcher16_checkbytes(b"\x28\x07", 1) == b"\xaf("
867867

868868
= Test hexdiff function
869869
~ not_pypy
870-
def test_hexdiff(a, b, autojunk=False):
870+
def test_hexdiff(a, b, algo=None, autojunk=False):
871871
conf_color_theme = conf.color_theme
872872
conf.color_theme = BlackAndWhite()
873873
with ContextManagerCaptureOutput() as cmco:
874-
hexdiff(a, b, autojunk=autojunk)
874+
hexdiff(a, b, algo=algo, autojunk=autojunk)
875875
result_hexdiff = cmco.get_output()
876876
conf.interactive = True
877877
conf.color_theme = conf_color_theme
@@ -901,12 +901,12 @@ expected += "0010 7F 00 00 01 ....
901901
expected += " 0010 7F 00 00 02 ....\n"
902902
assert result_hexdiff == expected
903903

904-
# Compare using autojunk
904+
# Compare using difflib
905905

906906
a = "A" * 1000 + "findme" + "B" * 1000
907907
b = "A" * 1000 + "B" * 1000
908-
ret1 = test_hexdiff(a, b)
909-
ret2 = test_hexdiff(a, b, autojunk=True)
908+
ret1 = test_hexdiff(a, b, algo="difflib")
909+
ret2 = test_hexdiff(a, b, algo="difflib", autojunk=True)
910910

911911
expected_ret1 = """
912912
03d0 03d0 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 AAAAAAAAAAAAAAAA

0 commit comments

Comments
 (0)