|
3 | 3 |
|
4 | 4 | import re |
5 | 5 |
|
| 6 | +from codecs import register_error, xmlcharrefreplace_errors |
| 7 | + |
6 | 8 | from ..constants import voidElements, booleanAttributes, spaceCharacters |
7 | 9 | from ..constants import rcdataElements, entities, xmlEntities |
8 | 10 | from .. import utils |
|
21 | 23 | "\u2008\u2009\u200a\u2028\u2029\u202f\u205f" |
22 | 24 | "\u3000]") |
23 | 25 |
|
24 | | -try: |
25 | | - from codecs import register_error, xmlcharrefreplace_errors |
26 | | -except ImportError: |
27 | | - unicode_encode_errors = "strict" |
28 | | -else: |
29 | | - unicode_encode_errors = "htmlentityreplace" |
30 | | - |
31 | | - encode_entity_map = {} |
32 | | - is_ucs4 = len("\U0010FFFF") == 1 |
33 | | - for k, v in list(entities.items()): |
34 | | - # skip multi-character entities |
35 | | - if ((is_ucs4 and len(v) > 1) or |
36 | | - (not is_ucs4 and len(v) > 2)): |
37 | | - continue |
38 | | - if v != "&": |
39 | | - if len(v) == 2: |
40 | | - v = utils.surrogatePairToCodepoint(v) |
41 | | - else: |
42 | | - v = ord(v) |
43 | | - if v not in encode_entity_map or k.islower(): |
44 | | - # prefer &lt; over &LT; and similarly for &amp;, &gt;, etc. |
45 | | - encode_entity_map[v] = k |
46 | | - |
47 | | - def htmlentityreplace_errors(exc): |
48 | | - if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)): |
49 | | - res = [] |
50 | | - codepoints = [] |
51 | | - skip = False |
52 | | - for i, c in enumerate(exc.object[exc.start:exc.end]): |
53 | | - if skip: |
54 | | - skip = False |
55 | | - continue |
56 | | - index = i + exc.start |
57 | | - if utils.isSurrogatePair(exc.object[index:min([exc.end, index + 2])]): |
58 | | - codepoint = utils.surrogatePairToCodepoint(exc.object[index:index + 2]) |
59 | | - skip = True |
60 | | - else: |
61 | | - codepoint = ord(c) |
62 | | - codepoints.append(codepoint) |
63 | | - for cp in codepoints: |
64 | | - e = encode_entity_map.get(cp) |
65 | | - if e: |
66 | | - res.append("&") |
67 | | - res.append(e) |
68 | | - if not e.endswith(";"): |
69 | | - res.append(";") |
70 | | - else: |
71 | | - res.append("&#x%s;" % (hex(cp)[2:])) |
72 | | - return ("".join(res), exc.end) |
73 | | - else: |
74 | | - return xmlcharrefreplace_errors(exc) |
75 | 26 |
|
76 | | - register_error(unicode_encode_errors, htmlentityreplace_errors) |
| 27 | +encode_entity_map = {} |
| 28 | +is_ucs4 = len("\U0010FFFF") == 1 |
| 29 | +for k, v in list(entities.items()): |
| 30 | + # skip multi-character entities |
| 31 | + if ((is_ucs4 and len(v) > 1) or |
| 32 | + (not is_ucs4 and len(v) > 2)): |
| 33 | + continue |
| 34 | + if v != "&": |
| 35 | + if len(v) == 2: |
| 36 | + v = utils.surrogatePairToCodepoint(v) |
| 37 | + else: |
| 38 | + v = ord(v) |
| 39 | + if v not in encode_entity_map or k.islower(): |
| 40 | + # prefer &lt; over &LT; and similarly for &amp;, &gt;, etc. |
| 41 | + encode_entity_map[v] = k |
| 42 | + |
| 43 | + |
| 44 | +def htmlentityreplace_errors(exc): |
| 45 | + if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)): |
| 46 | + res = [] |
| 47 | + codepoints = [] |
| 48 | + skip = False |
| 49 | + for i, c in enumerate(exc.object[exc.start:exc.end]): |
| 50 | + if skip: |
| 51 | + skip = False |
| 52 | + continue |
| 53 | + index = i + exc.start |
| 54 | + if utils.isSurrogatePair(exc.object[index:min([exc.end, index + 2])]): |
| 55 | + codepoint = utils.surrogatePairToCodepoint(exc.object[index:index + 2]) |
| 56 | + skip = True |
| 57 | + else: |
| 58 | + codepoint = ord(c) |
| 59 | + codepoints.append(codepoint) |
| 60 | + for cp in codepoints: |
| 61 | + e = encode_entity_map.get(cp) |
| 62 | + if e: |
| 63 | + res.append("&") |
| 64 | + res.append(e) |
| 65 | + if not e.endswith(";"): |
| 66 | + res.append(";") |
| 67 | + else: |
| 68 | + res.append("&#x%s;" % (hex(cp)[2:])) |
| 69 | + return ("".join(res), exc.end) |
| 70 | + else: |
| 71 | + return xmlcharrefreplace_errors(exc) |
77 | 72 |
|
78 | | - del register_error |
| 73 | +register_error("htmlentityreplace", htmlentityreplace_errors) |
79 | 74 |
|
80 | 75 |
|
81 | 76 | class HTMLSerializer(object): |
@@ -168,7 +163,7 @@ def __init__(self, **kwargs): |
168 | 163 | def encode(self, string): |
169 | 164 | assert(isinstance(string, text_type)) |
170 | 165 | if self.encoding: |
171 | | - return string.encode(self.encoding, unicode_encode_errors) |
| 166 | + return string.encode(self.encoding, "htmlentityreplace") |
172 | 167 | else: |
173 | 168 | return string |
174 | 169 |
|
|
0 commit comments