|
43 | 43 |
|
44 | 44 | LITERALS = (ast.Num, ast.Str) |
45 | 45 |
|
import codecs


# A variant of the 'replace' error handler that substitutes the six-character ASCII escape
# text \ufffd (backslash, 'u', 'f', 'f', 'f', 'd') for every unencodable character, rather
# than '?'. Without this, a string like '\uD800' (which is not encodable) would get mapped
# to '?', and potentially clash with the regular string '?' if it appeared elsewhere in the
# source code. Used in 'get_label_for_object' below.
# Based on code from https://peps.python.org/pep-0293/
def fffd_replace(exc):
    """Codec error handler registered under the name 'fffdreplace'.

    Args:
        exc: The UnicodeEncodeError, UnicodeDecodeError or UnicodeTranslateError
            describing the span ``exc.start:exc.end`` that could not be processed.

    Returns:
        A ``(replacement, resume_position)`` tuple. For encode and translate
        errors the replacement is the escape text repeated once per offending
        character; for decode errors it is a single copy. Processing resumes
        at ``exc.end``.

    Raises:
        TypeError: If ``exc`` is not one of the three Unicode error types.
    """
    # NOTE(review): the replacement is deliberately kept as plain ASCII text and
    # not the real U+FFFD character. CPython's UTF-8 encoder appears to reject
    # non-ASCII str replacements returned by an error handler -- confirm before
    # changing this literal.
    marker = u"\\ufffd"
    if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
        return ((exc.end - exc.start) * marker, exc.end)
    if isinstance(exc, UnicodeDecodeError):
        return (marker, exc.end)
    # Exception instances have no __name__ attribute; the class does. Reading it
    # from the instance would raise AttributeError instead of the TypeError below.
    raise TypeError("can't handle %s" % type(exc).__name__)


# Make the handler selectable via errors='fffdreplace' in encode()/decode() calls.
codecs.register_error("fffdreplace", fffd_replace)
| 62 | + |
46 | 63 | class _CObject(object): |
47 | 64 | '''Utility class to wrap arbitrary C objects. |
48 | 65 | Treat all objects as unique. Rely on naming in the |
@@ -239,7 +256,7 @@ def get_label_for_object(self, obj, default_label, obj_type): |
239 | 256 | else: |
240 | 257 | prefix = u"C_bytes$" |
241 | 258 | if t is str: |
242 | | - obj = obj.encode("utf8", errors='replace') |
| 259 | + obj = obj.encode("utf8", errors='fffdreplace') |
243 | 260 | return prefix + hashlib.sha1(obj).hexdigest() |
244 | 261 | if t is bytes: |
245 | 262 | return prefix + hashlib.sha1(obj).hexdigest() |
|
0 commit comments