|
44 | 44 | hex_bytes, |
45 | 45 | bytes_encode, |
46 | 46 | ) |
47 | | -from scapy.error import log_runtime, Scapy_Exception, warning |
| 47 | +from scapy.error import ( |
| 48 | + log_interactive, |
| 49 | + log_runtime, |
| 50 | + Scapy_Exception, |
| 51 | + warning, |
| 52 | +) |
48 | 53 | from scapy.pton_ntop import inet_pton |
49 | 54 |
|
50 | 55 | # Typing imports |
@@ -392,50 +397,111 @@ def repr_hex(s): |
392 | 397 |
|
393 | 398 |
|
394 | 399 | @conf.commands.register |
395 | | -def hexdiff(a, b, autojunk=False): |
396 | | - # type: (Union[Packet, AnyStr], Union[Packet, AnyStr], bool) -> None |
| 400 | +def hexdiff( |
| 401 | + a: Union['Packet', AnyStr], |
| 402 | + b: Union['Packet', AnyStr], |
| 403 | + algo: Optional[str] = None, |
| 404 | + autojunk: bool = False, |
| 405 | +) -> None: |
397 | 406 | """ |
398 | 407 | Show differences between 2 binary strings, Packets... |
399 | 408 |
|
400 | | - For the autojunk parameter, see |
401 | | - https://docs.python.org/3.8/library/difflib.html#difflib.SequenceMatcher |
| 409 | + Available algorithms: |
| 410 | + - wagnerfischer: Use the Wagner and Fischer algorithm to compute the |
| 411 | + Levenshtein distance between the strings then backtrack. |
| 412 | + - difflib: Use the difflib.SequenceMatcher implementation. This is based on a |
| 413 | + modified version of the Ratcliff and Obershelp algorithm. |
| 414 | + This is much faster, but far less accurate. |
| 415 | + https://docs.python.org/3.8/library/difflib.html#difflib.SequenceMatcher |
402 | 416 |
|
403 | 417 | :param a: |
404 | 418 | :param b: The binary strings, packets... to compare |
405 | | - :param autojunk: Setting it to True will likely increase the comparison |
406 | | - speed a lot on big byte strings, but will reduce accuracy (will tend |
407 | | - to miss insertion and see replacements instead for instance). |
| 419 | + :param algo: Force the algo to be 'wagnerfischer' or 'difflib'. |
| 420 | + By default, this is chosen depending on the complexity, optimistically |
| 421 | + preferring wagnerfischer unless really necessary. |
| 422 | + :param autojunk: (difflib only) See difflib documentation. |
408 | 423 | """ |
409 | | - |
410 | | - # Compare the strings using difflib |
411 | | - |
412 | 424 | xb = bytes_encode(a) |
413 | 425 | yb = bytes_encode(b) |
414 | 426 |
|
415 | | - sm = difflib.SequenceMatcher(a=xb, b=yb, autojunk=autojunk) |
416 | | - xarr = [xb[i:i + 1] for i in range(len(xb))] |
417 | | - yarr = [yb[i:i + 1] for i in range(len(yb))] |
| 427 | + if algo is None: |
| 428 | + # Choose the best algorithm |
| 429 | + complexity = len(xb) * len(yb) |
| 430 | + if complexity < 1e7: |
| 431 | + # Comparing two (non-jumbos) Ethernet packets is ~2e6 which is manageable. |
| 432 | + # Anything much larger than this shouldn't be attempted by default. |
| 433 | + algo = "wagnerfischer" |
| 434 | + if complexity > 1e6: |
| 435 | + log_interactive.info( |
| 436 | + "Complexity is a bit high. hexdiff will take a few seconds." |
| 437 | + ) |
| 438 | + else: |
| 439 | + algo = "difflib" |
418 | 440 |
|
419 | 441 | backtrackx = [] |
420 | 442 | backtracky = [] |
421 | | - for opcode in sm.get_opcodes(): |
422 | | - typ, x0, x1, y0, y1 = opcode |
423 | | - if typ == 'delete': |
424 | | - backtrackx += xarr[x0:x1] |
425 | | - backtracky += [b''] * (x1 - x0) |
426 | | - elif typ == 'insert': |
427 | | - backtrackx += [b''] * (y1 - y0) |
428 | | - backtracky += yarr[y0:y1] |
429 | | - elif typ in ['equal', 'replace']: |
430 | | - backtrackx += xarr[x0:x1] |
431 | | - backtracky += yarr[y0:y1] |
432 | | - |
433 | | - if autojunk: |
| 443 | + |
| 444 | + if algo == "wagnerfischer": |
| 445 | + xb = xb[::-1] |
| 446 | + yb = yb[::-1] |
| 447 | + |
| 448 | + # costs for the 3 operations |
| 449 | + INSERT = 1 |
| 450 | + DELETE = 1 |
| 451 | + SUBST = 1 |
| 452 | + |
| 453 | + # Typically, d[i,j] will hold the distance between |
| 454 | + # the first i characters of xb and the first j characters of yb. |
| 455 | + # We change the Wagner Fischer to also store pointers to all |
| 456 | + # the intermediate steps taken while calculating the Levenshtein distance. |
| 457 | + d = {(-1, -1): (0, (-1, -1))} |
| 458 | + for j in range(len(yb)): |
| 459 | + d[-1, j] = (j + 1) * INSERT, (-1, j - 1) |
| 460 | + for i in range(len(xb)): |
| 461 | + d[i, -1] = (i + 1) * INSERT + 1, (i - 1, -1) |
| 462 | + |
| 463 | + # Compute the Levenshtein distance between the two strings, but |
| 464 | + # store all the steps to be able to backtrack at the end. |
| 465 | + for j in range(len(yb)): |
| 466 | + for i in range(len(xb)): |
| 467 | + d[i, j] = min( |
| 468 | + (d[i - 1, j - 1][0] + SUBST * (xb[i] != yb[j]), (i - 1, j - 1)), |
| 469 | + (d[i - 1, j][0] + DELETE, (i - 1, j)), |
| 470 | + (d[i, j - 1][0] + INSERT, (i, j - 1)), |
| 471 | + ) |
| 472 | + |
| 473 | + # Iterate through the steps backwards to create the diff |
| 474 | + i = len(xb) - 1 |
| 475 | + j = len(yb) - 1 |
| 476 | + while not (i == j == -1): |
| 477 | + i2, j2 = d[i, j][1] |
| 478 | + backtrackx.append(xb[i2 + 1:i + 1]) |
| 479 | + backtracky.append(yb[j2 + 1:j + 1]) |
| 480 | + i, j = i2, j2 |
| 481 | + elif algo == "difflib": |
| 482 | + sm = difflib.SequenceMatcher(a=xb, b=yb, autojunk=autojunk) |
| 483 | + xarr = [xb[i:i + 1] for i in range(len(xb))] |
| 484 | + yarr = [yb[i:i + 1] for i in range(len(yb))] |
| 485 | + # Iterate through opcodes to build the backtrack |
| 486 | + for opcode in sm.get_opcodes(): |
| 487 | + typ, x0, x1, y0, y1 = opcode |
| 488 | + if typ == 'delete': |
| 489 | + backtrackx += xarr[x0:x1] |
| 490 | + backtracky += [b''] * (x1 - x0) |
| 491 | + elif typ == 'insert': |
| 492 | + backtrackx += [b''] * (y1 - y0) |
| 493 | + backtracky += yarr[y0:y1] |
| 494 | + elif typ in ['equal', 'replace']: |
| 495 | + backtrackx += xarr[x0:x1] |
| 496 | + backtracky += yarr[y0:y1] |
434 | 497 | # Some lines may have been considered as junk. Check the sizes |
435 | | - lbx = len(backtrackx) |
436 | | - lby = len(backtracky) |
437 | | - backtrackx += [b''] * (max(lbx, lby) - lbx) |
438 | | - backtracky += [b''] * (max(lbx, lby) - lby) |
| 498 | + if autojunk: |
| 499 | + lbx = len(backtrackx) |
| 500 | + lby = len(backtracky) |
| 501 | + backtrackx += [b''] * (max(lbx, lby) - lbx) |
| 502 | + backtracky += [b''] * (max(lbx, lby) - lby) |
| 503 | + else: |
| 504 | + raise ValueError("Unknown algorithm '%s'" % algo) |
439 | 505 |
|
440 | 506 | # Print the diff |
441 | 507 |
|
|
0 commit comments