@@ -185,14 +185,10 @@ def __init__(self, source):
185185 # Such platforms will have already checked for such
186186 # surrogate errors, so no need to do this checking.
187187 self .reportCharacterErrors = None
188- self .replaceCharactersRegexp = None
189188 elif len ("\U0010FFFF " ) == 1 :
190189 self .reportCharacterErrors = self .characterErrorsUCS4
191- self .replaceCharactersRegexp = re .compile (eval ('"[\\ uD800-\\ uDFFF]"' ))
192190 else :
193191 self .reportCharacterErrors = self .characterErrorsUCS2
194- self .replaceCharactersRegexp = re .compile (
195- eval ('"([\\ uD800-\\ uDBFF](?![\\ uDC00-\\ uDFFF])|(?<![\\ uD800-\\ uDBFF])[\\ uDC00-\\ uDFFF])"' ))
196192
197193 # List of where new lines occur
198194 self .newLines = [0 ]
@@ -290,10 +286,7 @@ def readChunk(self, chunkSize=None):
290286 if self .reportCharacterErrors :
291287 self .reportCharacterErrors (data )
292288
293- # Replace invalid characters
294- # Note U+0000 is dealt with in the tokenizer
295- data = self .replaceCharactersRegexp .sub ("\ufffd " , data )
296-
289+ # Normalize line endings: CRLF and lone CR both become LF
297290 data = data .replace ("\r \n " , "\n " )
298291 data = data .replace ("\r " , "\n " )
299292
0 commit comments