Skip to content
This repository was archived by the owner on Aug 5, 2024. It is now read-only.

Commit 535e29e

Browse files
committed
Add fixes for Python2
1 parent 8d2a5f8 commit 535e29e

File tree

2 files changed

+29
-0
lines changed

2 files changed

+29
-0
lines changed

python2/diff_match_patch.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
__author__ = 'fraser@google.com (Neil Fraser)'
2929

3030
import re
31+
import struct
3132
import sys
3233
import time
3334
import urllib
@@ -1135,6 +1136,16 @@ def diff_levenshtein(self, diffs):
11351136
levenshtein += max(insertions, deletions)
11361137
return levenshtein
11371138

1139+
@classmethod
def is_high_surrogate(cls, utf16be_bytes):
    """Report whether one UTF-16BE code unit is a high (leading) surrogate.

    Args:
      utf16be_bytes: Byte string expected to hold exactly one big-endian
          16-bit code unit (two bytes).

    Returns:
      True if the code unit lies in the range 0xD800..0xDBFF. False
      otherwise, including when the input is not exactly two bytes long
      (for example the empty slice taken from an empty string).
    """
    # struct.unpack('>H', ...) raises struct.error unless it is handed exactly
    # two bytes. Anything else cannot be a single code unit, so it is simply
    # "not a surrogate" rather than an error.
    if len(utf16be_bytes) != 2:
        return False
    code_unit = struct.unpack('>H', utf16be_bytes)[0]
    return 0xd800 <= code_unit <= 0xdbff
1143+
1144+
@classmethod
def is_low_surrogate(cls, utf16be_bytes):
    """Report whether one UTF-16BE code unit is a low (trailing) surrogate.

    Args:
      utf16be_bytes: Byte string expected to hold exactly one big-endian
          16-bit code unit (two bytes).

    Returns:
      True if the code unit lies in the range 0xDC00..0xDFFF. False
      otherwise, including when the input is not exactly two bytes long
      (for example the empty slice taken from an empty string).
    """
    # struct.unpack('>H', ...) raises struct.error unless it is handed exactly
    # two bytes. Anything else cannot be a single code unit, so it is simply
    # "not a surrogate" rather than an error.
    if len(utf16be_bytes) != 2:
        return False
    code_unit = struct.unpack('>H', utf16be_bytes)[0]
    return 0xdc00 <= code_unit <= 0xdfff
1148+
11381149
def diff_toDelta(self, diffs):
11391150
"""Crush the diff into an encoded string which describes the operations
11401151
required to transform text1 into text2.
@@ -1148,7 +1159,21 @@ def diff_toDelta(self, diffs):
11481159
Delta text.
11491160
"""
11501161
text = []
1162+
last_end = None
11511163
for (op, data) in diffs:
1164+
encoded = data.encode('utf-16be')
1165+
this_top = encoded[0:2]
1166+
this_end = encoded[-2:]
1167+
1168+
if self.is_high_surrogate(this_end):
1169+
encoded = encoded[0:-2]
1170+
1171+
if last_end and self.is_high_surrogate(last_end) and self.is_low_surrogate(this_top):
1172+
encoded = last_end + encoded
1173+
1174+
data = encoded.decode('utf-16be')
1175+
last_end = this_end
1176+
11521177
if op == self.DIFF_INSERT:
11531178
# High ascii will raise UnicodeDecodeError. Use Unicode instead.
11541179
data = data.encode("utf-8")

python2/tests/diff_match_patch_test.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -441,6 +441,10 @@ def testDiffDelta(self):
441441
# Convert delta string into a diff.
442442
self.assertEquals(diffs, self.dmp.diff_fromDelta(text1, delta))
443443

444+
diffs = [(self.dmp.DIFF_EQUAL, u"\ud83d\ude4b\ud83d"), (self.dmp.DIFF_INSERT, u"\ude4c\ud83d"), (self.dmp.DIFF_EQUAL, u"\ude4b")]
445+
delta = self.dmp.diff_toDelta(diffs)
446+
self.assertEquals("=2\t+%F0%9F%99%8C\t=2", delta)
447+
444448
# Verify pool of unchanged characters.
445449
diffs = [(self.dmp.DIFF_INSERT, "A-Z a-z 0-9 - _ . ! ~ * ' ( ) ; / ? : @ & = + $ , # ")]
446450
text2 = self.dmp.diff_text2(diffs)

0 commit comments

Comments
 (0)