@@ -28,7 +28,18 @@ class BufferedIOBase(object):
# ASCII-encoded (bytes) form of every entry in ``asciiUppercase``.
asciiUppercaseBytes = frozenset(
    letter.encode("ascii") for letter in asciiUppercase
)
# The space-character bytes plus the two angle brackets.
spacesAngleBrackets = spaceCharactersBytes | frozenset((b">", b"<"))
3030
31- invalid_unicode_re = re .compile ("[\u0001 -\u0008 \u000B \u000E -\u001F \u007F -\u009F \uD800 -\uDFFF \uFDD0 -\uFDEF \uFFFE \uFFFF \U0001FFFE \U0001FFFF \U0002FFFE \U0002FFFF \U0003FFFE \U0003FFFF \U0004FFFE \U0004FFFF \U0005FFFE \U0005FFFF \U0006FFFE \U0006FFFF \U0007FFFE \U0007FFFF \U0008FFFE \U0008FFFF \U0009FFFE \U0009FFFF \U000AFFFE \U000AFFFF \U000BFFFE \U000BFFFF \U000CFFFE \U000CFFFF \U000DFFFE \U000DFFFF \U000EFFFE \U000EFFFF \U000FFFFE \U000FFFFF \U0010FFFE \U0010FFFF ]" )
31+
# Character class matching the code points this module treats as invalid
# input: control characters U+0001-U+0008, U+000B, U+000E-U+001F and
# U+007F-U+009F, plus U+FDD0-U+FDEF and the U+xxFFFE / U+xxFFFF pair of
# every plane. Lone surrogates are deliberately absent so that this literal
# is legal on every platform; they are added below where supported.
invalid_unicode_no_surrogate = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]"

if utils.supports_lone_surrogates:
    # Use one extra step of indirection and create the surrogate range
    # with eval. Writing it as a plain literal would introduce an illegal
    # unicode literal on platforms not supporting such lone surrogates.
    # The eval argument is a fixed constant, never external input.
    #
    # The range has to live INSIDE the character class: drop the closing
    # bracket, append the range, then close the class again. Appending
    # after the "]" would instead require the literal surrogate text to
    # follow an invalid character, so lone surrogates would never match.
    invalid_unicode_re = re.compile(
        invalid_unicode_no_surrogate[:-1] +
        eval('"\\uD800-\\uDFFF"') +
        "]")
else:
    invalid_unicode_re = re.compile(invalid_unicode_no_surrogate)
3243
3344non_bmp_invalid_codepoints = set ([0x1FFFE , 0x1FFFF , 0x2FFFE , 0x2FFFF , 0x3FFFE ,
3445 0x3FFFF , 0x4FFFE , 0x4FFFF , 0x5FFFE , 0x5FFFF ,
@@ -164,13 +175,18 @@ def __init__(self, source):
164175
165176 """
166177
167- # Craziness
168- if len ("\U0010FFFF " ) == 1 :
178+ if not utils .supports_lone_surrogates :
179+ # Such platforms will have already checked for such
180+ # surrogate errors, so no need to do this checking.
181+ self .reportCharacterErrors = None
182+ self .replaceCharactersRegexp = None
183+ elif len ("\U0010FFFF " ) == 1 :
169184 self .reportCharacterErrors = self .characterErrorsUCS4
170- self .replaceCharactersRegexp = re .compile ("[\uD800 -\uDFFF ]" )
185+ self .replaceCharactersRegexp = re .compile (eval ( ' "[\\ uD800-\\ uDFFF]"' ) )
171186 else :
172187 self .reportCharacterErrors = self .characterErrorsUCS2
173- self .replaceCharactersRegexp = re .compile ("([\uD800 -\uDBFF ](?![\uDC00 -\uDFFF ])|(?<![\uD800 -\uDBFF ])[\uDC00 -\uDFFF ])" )
188+ self .replaceCharactersRegexp = re .compile (
189+ eval ('"([\\ uD800-\\ uDBFF](?![\\ uDC00-\\ uDFFF])|(?<![\\ uD800-\\ uDBFF])[\\ uDC00-\\ uDFFF])"' ))
174190
175191 # List of where new lines occur
176192 self .newLines = [0 ]
@@ -265,11 +281,12 @@ def readChunk(self, chunkSize=None):
265281 self ._bufferedCharacter = data [- 1 ]
266282 data = data [:- 1 ]
267283
268- self .reportCharacterErrors (data )
284+ if self .reportCharacterErrors :
285+ self .reportCharacterErrors (data )
269286
270- # Replace invalid characters
271- # Note U+0000 is dealt with in the tokenizer
272- data = self .replaceCharactersRegexp .sub ("\ufffd " , data )
287+ # Replace invalid characters
288+ # Note U+0000 is dealt with in the tokenizer
289+ data = self .replaceCharactersRegexp .sub ("\ufffd " , data )
273290
274291 data = data .replace ("\r \n " , "\n " )
275292 data = data .replace ("\r " , "\n " )
0 commit comments