Add fixes for Python 2: keep UTF-16 surrogate pairs intact in diff_toDelta

dmsnell · dmsnell · commit 535e29e8447e · 2019-11-10T00:28:26.000-06:00
diff --git a/python2/diff_match_patch.py b/python2/diff_match_patch.py
@@ -28,6 +28,7 @@
 __author__ = 'fraser@google.com (Neil Fraser)'
 
 import re
+import struct
 import sys
 import time
 import urllib
@@ -1135,6 +1136,16 @@ def diff_levenshtein(self, diffs):
     levenshtein += max(insertions, deletions)
     return levenshtein
 
+  @classmethod
+  def is_high_surrogate(cls, utf16be_bytes):
+    """Return True if utf16be_bytes is exactly one UTF-16BE high surrogate (U+D800-U+DBFF); any other length gives False."""
+    return len(utf16be_bytes) == 2 and 0xd800 <= struct.unpack('>H', utf16be_bytes)[0] <= 0xdbff
+
+  @classmethod
+  def is_low_surrogate(cls, utf16be_bytes):
+    """Return True if utf16be_bytes is exactly one UTF-16BE low surrogate (U+DC00-U+DFFF); any other length gives False."""
+    return len(utf16be_bytes) == 2 and 0xdc00 <= struct.unpack('>H', utf16be_bytes)[0] <= 0xdfff
+
   def diff_toDelta(self, diffs):
     """Crush the diff into an encoded string which describes the operations
     required to transform text1 into text2.
@@ -1148,7 +1159,21 @@ def diff_toDelta(self, diffs):
       Delta text.
     """
     text = []
+    last_end = None
     for (op, data) in diffs:
+      encoded = data.encode('utf-16be')
+      this_top = encoded[0:2]
+      this_end = encoded[-2:]
+
+      if self.is_high_surrogate(this_end):
+        encoded = encoded[0:-2]
+
+      if last_end and self.is_high_surrogate(last_end) and self.is_low_surrogate(this_top):
+        encoded = last_end + encoded
+
+      data = encoded.decode('utf-16be')
+      last_end = this_end
+
       if op == self.DIFF_INSERT:
         # High ascii will raise UnicodeDecodeError.  Use Unicode instead.
         data = data.encode("utf-8")
diff --git a/python2/tests/diff_match_patch_test.py b/python2/tests/diff_match_patch_test.py
@@ -441,6 +441,10 @@ def testDiffDelta(self):
     # Convert delta string into a diff.
     self.assertEquals(diffs, self.dmp.diff_fromDelta(text1, delta))
 
+    diffs = [(self.dmp.DIFF_EQUAL, u"\ud83d\ude4b\ud83d"), (self.dmp.DIFF_INSERT, u"\ude4c\ud83d"), (self.dmp.DIFF_EQUAL, u"\ude4b")]
+    delta = self.dmp.diff_toDelta(diffs)
+    self.assertEquals("=2\t+%F0%9F%99%8C\t=2", delta)
+
     # Verify pool of unchanged characters.
     diffs = [(self.dmp.DIFF_INSERT, "A-Z a-z 0-9 - _ . ! ~ * ' ( ) ; / ? : @ & = + $ , # ")]
     text2 = self.dmp.diff_text2(diffs)