Adjust Python3 code

dmsnell · dmsnell · commit d0a578f152bc · 2019-11-10T10:56:35.000-06:00
I'm not sure that I made the right assumptions about Python3's Unicode
handling when I made the first patch to it.

By hand-constructing the specific `diffs` list in the test, I created a
sequence of code units (surrogate pairs split across diff entries) that
`diff_main` in Python3 would _not_ have produced, because it operates on
Unicode code points natively when finding the common prefix.

Therefore I do not think that the Python3 library experienced this
problem as the others did. Nonetheless, it _has_ been reporting the diff
length differently from the other languages, and I have left the change
to the length reporting in place.

Of note, the language ports do not appear to be truly in harmony,
despite appearances. The `lua` wiki page makes this clear, but at least
with Python we have the ability to harmonize the meaning of the lengths,
and I have done that in this change.
diff --git a/python3/diff_match_patch.py b/python3/diff_match_patch.py
@@ -1134,16 +1134,6 @@ def diff_levenshtein(self, diffs):
     levenshtein += max(insertions, deletions)
     return levenshtein
 
-  @classmethod
-  def is_high_surrogate(cls, utf16be_bytes):
-    c = struct.unpack('>H', utf16be_bytes)[0]
-    return c >= 0xd800 and c <= 0xdbff
-
-  @classmethod
-  def is_low_surrogate(cls, utf16be_bytes):
-    c = struct.unpack('>H', utf16be_bytes)[0]
-    return c >= 0xdc00 and c <= 0xdfff
-
   def diff_toDelta(self, diffs):
     """Crush the diff into an encoded string which describes the operations
     required to transform text1 into text2.
@@ -1159,18 +1149,6 @@ def diff_toDelta(self, diffs):
     text = []
     last_end = None
     for (op, data) in diffs:
-      encoded = data.encode('utf-16be', 'surrogatepass')
-      this_top = encoded[0:2]
-      this_end = encoded[-2:]
-
-      if self.is_high_surrogate(this_end):
-        encoded = encoded[0:-2]
-
-      if last_end and self.is_high_surrogate(last_end) and self.is_low_surrogate(this_top):
-        encoded = last_end + encoded
-
-      data = encoded.decode('utf-16be', 'surrogateescape')
-      last_end = this_end
       if op == self.DIFF_INSERT:
         # High ascii will raise UnicodeDecodeError.  Use Unicode instead.
         data = data.encode("utf-8")
diff --git a/python3/tests/diff_match_patch_test.py b/python3/tests/diff_match_patch_test.py
@@ -445,7 +445,7 @@ def testDiffDelta(self):
     # Convert delta string into a diff.
     self.assertEqual(diffs, self.dmp.diff_fromDelta(text1, delta))
 
-    diffs = [(self.dmp.DIFF_EQUAL, "\ud83d\ude4b\ud83d"), (self.dmp.DIFF_INSERT, "\ude4c\ud83d"), (self.dmp.DIFF_EQUAL, "\ude4b")]
+    diffs = self.dmp.diff_main("\U0001F64B\U0001F64B", "\U0001F64B\U0001F64C\U0001F64B")
     delta = self.dmp.diff_toDelta(diffs)
     self.assertEqual("=2\t+%F0%9F%99%8C\t=2", delta)