2828__author__ = 'fraser@google.com (Neil Fraser)'
2929
3030import re
31+ import struct
3132import sys
3233import time
3334import urllib
@@ -1135,6 +1136,16 @@ def diff_levenshtein(self, diffs):
11351136 levenshtein += max (insertions , deletions )
11361137 return levenshtein
11371138
@classmethod
def is_high_surrogate(cls, utf16be_bytes):
    """Tell whether a big-endian UTF-16 code unit is a high surrogate.

    Args:
      utf16be_bytes: Byte string holding one UTF-16BE code unit (2 bytes).
          Any other length (e.g. the empty slice taken from an empty
          string) is treated as "not a surrogate".

    Returns:
      True if the code unit lies in the range 0xD800-0xDBFF, else False.
    """
    # struct.unpack('>H', ...) demands exactly two bytes; slicing an empty
    # encoded string yields zero bytes, which would otherwise raise
    # struct.error instead of simply answering "no".
    if len(utf16be_bytes) != 2:
        return False
    code_unit = struct.unpack('>H', utf16be_bytes)[0]
    return 0xd800 <= code_unit <= 0xdbff
1143+
@classmethod
def is_low_surrogate(cls, utf16be_bytes):
    """Tell whether a big-endian UTF-16 code unit is a low surrogate.

    Args:
      utf16be_bytes: Byte string holding one UTF-16BE code unit (2 bytes).
          Any other length (e.g. the empty slice taken from an empty
          string) is treated as "not a surrogate".

    Returns:
      True if the code unit lies in the range 0xDC00-0xDFFF, else False.
    """
    # struct.unpack('>H', ...) demands exactly two bytes; slicing an empty
    # encoded string yields zero bytes, which would otherwise raise
    # struct.error instead of simply answering "no".
    if len(utf16be_bytes) != 2:
        return False
    code_unit = struct.unpack('>H', utf16be_bytes)[0]
    return 0xdc00 <= code_unit <= 0xdfff
1148+
11381149 def diff_toDelta (self , diffs ):
11391150 """Crush the diff into an encoded string which describes the operations
11401151 required to transform text1 into text2.
@@ -1148,7 +1159,21 @@ def diff_toDelta(self, diffs):
11481159 Delta text.
11491160 """
11501161 text = []
1162+ last_end = None
11511163 for (op , data ) in diffs :
1164+ encoded = data .encode ('utf-16be' )
1165+ this_top = encoded [0 :2 ]
1166+ this_end = encoded [- 2 :]
1167+
1168+ if self .is_high_surrogate (this_end ):
1169+ encoded = encoded [0 :- 2 ]
1170+
1171+ if last_end and self .is_high_surrogate (last_end ) and self .is_low_surrogate (this_top ):
1172+ encoded = last_end + encoded
1173+
1174+ data = encoded .decode ('utf-16be' )
1175+ last_end = this_end
1176+
11521177 if op == self .DIFF_INSERT :
11531178 # High ascii will raise UnicodeDecodeError. Use Unicode instead.
11541179 data = data .encode ("utf-8" )
0 commit comments