1- from __future__ import absolute_import
1+ from __future__ import absolute_import , unicode_literals
22from future .utils import PY3
33__future_module__ = True
44
1313 General functions for HTML manipulation.
1414 """
1515
16+ import re as _re
17+ from future .moves .html .entities import html5 as _html5
18+
_chr = chr
try:
    _unichr = unichr  # Python 2: separate builtin for text characters
except NameError:
    _unichr = None  # Python 3: the builtin chr() already returns text


def chr(num):
    """
    Return the character for the code point ``num``.

    Code points below 256 (and every code point when no ``unichr``
    builtin exists, i.e. on Python 3) are handled by the builtin
    ``chr``.  Otherwise ``unichr`` is used; if it rejects the value with
    ValueError, the character is produced by decoding a ``\\UXXXXXXXX``
    escape instead.

    Raises whatever the underlying builtin raises for a value it cannot
    convert (e.g. ValueError for a negative number).
    """
    if _unichr is None or 0 <= num < 256:
        return _chr(num)
    try:
        return _unichr(num)
    except ValueError:
        # unichr() refused the code point; build it from an escape sequence.
        return str('\\U%08x' % num).decode('unicode-escape')
27+
1628 def escape (s , quote = True ):
1729 """
1830 Replace special characters "&", "<" and ">" to HTML-safe sequences.
@@ -28,4 +40,111 @@ def escape(s, quote=True):
2840 s = s .replace ('\' ' , "'" )
2941 return s
3042
31- __all__ = ['escape' ]
43+
44+ # see http://www.w3.org/TR/html5/syntax.html#tokenizing-character-references
45+
# Numeric character references whose code point is replaced by a fixed
# character instead of being decoded literally.  Every value is exactly
# one character long.
_invalid_charrefs = {
    0x00: '\ufffd',  # REPLACEMENT CHARACTER
    0x0d: '\r',      # CARRIAGE RETURN
    0x80: '\u20ac',  # EURO SIGN
    0x81: '\x81',    # <control>
    0x82: '\u201a',  # SINGLE LOW-9 QUOTATION MARK
    0x83: '\u0192',  # LATIN SMALL LETTER F WITH HOOK
    0x84: '\u201e',  # DOUBLE LOW-9 QUOTATION MARK
    0x85: '\u2026',  # HORIZONTAL ELLIPSIS
    0x86: '\u2020',  # DAGGER
    0x87: '\u2021',  # DOUBLE DAGGER
    0x88: '\u02c6',  # MODIFIER LETTER CIRCUMFLEX ACCENT
    0x89: '\u2030',  # PER MILLE SIGN
    0x8a: '\u0160',  # LATIN CAPITAL LETTER S WITH CARON
    0x8b: '\u2039',  # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
    0x8c: '\u0152',  # LATIN CAPITAL LIGATURE OE
    0x8d: '\x8d',    # <control>
    0x8e: '\u017d',  # LATIN CAPITAL LETTER Z WITH CARON
    0x8f: '\x8f',    # <control>
    0x90: '\x90',    # <control>
    0x91: '\u2018',  # LEFT SINGLE QUOTATION MARK
    0x92: '\u2019',  # RIGHT SINGLE QUOTATION MARK
    0x93: '\u201c',  # LEFT DOUBLE QUOTATION MARK
    0x94: '\u201d',  # RIGHT DOUBLE QUOTATION MARK
    0x95: '\u2022',  # BULLET
    0x96: '\u2013',  # EN DASH
    0x97: '\u2014',  # EM DASH
    0x98: '\u02dc',  # SMALL TILDE
    0x99: '\u2122',  # TRADE MARK SIGN
    0x9a: '\u0161',  # LATIN SMALL LETTER S WITH CARON
    0x9b: '\u203a',  # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
    0x9c: '\u0153',  # LATIN SMALL LIGATURE OE
    0x9d: '\x9d',    # <control>
    0x9e: '\u017e',  # LATIN SMALL LETTER Z WITH CARON
    0x9f: '\u0178',  # LATIN CAPITAL LETTER Y WITH DIAERESIS
}
82+
# Code points for which a numeric character reference decodes to the
# empty string (see the lookup in _replace_charref).
_invalid_codepoints = set()
# 0x0001 to 0x0008
_invalid_codepoints.update(range(0x1, 0x8 + 1))
# 0x000E to 0x001F
_invalid_codepoints.update(range(0xe, 0x1f + 1))
# 0x007F to 0x009F
_invalid_codepoints.update(range(0x7f, 0x9f + 1))
# 0xFDD0 to 0xFDEF
_invalid_codepoints.update(range(0xfdd0, 0xfdef + 1))
# others: 0xB plus the last two code points (xFFFE, xFFFF) of each of
# the seventeen planes 0x00 .. 0x10
_invalid_codepoints.add(0xb)
_invalid_codepoints.update({
    (plane << 16) | tail
    for plane in range(0x10 + 1)
    for tail in (0xfffe, 0xffff)
})
105+
106+
def _replace_charref(s):
    """
    Return the replacement text for one character reference.

    ``s`` is a regular-expression match object (as passed by
    ``_charref.sub``); its group 1 holds the reference without the
    leading "&".

    Numeric references ("#123", "#x7b", optional trailing ";"):
      * a code point listed in _invalid_charrefs yields its mapped
        character;
      * a surrogate (0xD800-0xDFFF) or a value above 0x10FFFF yields
        U+FFFD;
      * a code point listed in _invalid_codepoints yields '';
      * anything else yields chr(num).

    Named references:
      * an exact key of _html5 yields its value;
      * otherwise the longest proper prefix (at least two characters)
        that is a key of _html5 is replaced and the remaining text is
        kept verbatim;
      * if nothing matches, the original text ("&" + reference) is
        returned unchanged.
    """
    ref = s.group(1)
    if ref[0] == '#':
        # numeric charref: hexadecimal after "#x"/"#X", decimal otherwise
        if ref[1] in 'xX':
            num = int(ref[2:].rstrip(';'), 16)
        else:
            num = int(ref[1:].rstrip(';'))
        if num in _invalid_charrefs:
            return _invalid_charrefs[num]
        if 0xD800 <= num <= 0xDFFF or num > 0x10FFFF:
            # exactly one REPLACEMENT CHARACTER, nothing else
            return '\uFFFD'
        if num in _invalid_codepoints:
            return ''
        return chr(num)

    # named charref
    if ref in _html5:
        return _html5[ref]
    # find the longest matching name (as defined by the standard)
    for end in range(len(ref) - 1, 1, -1):
        prefix = ref[:end]
        if prefix in _html5:
            return _html5[prefix] + ref[end:]
    # not a known reference: give the input text back untouched
    return '&' + ref
132+
133+
# Pattern for one character reference.  Group 1 captures everything after
# the leading "&", including the optional terminating ";".
_charref = _re.compile(
    '&(' + '|'.join((
        r'#[0-9]+;?',               # decimal numeric reference
        r'#[xX][0-9a-fA-F]+;?',     # hexadecimal numeric reference
        r'[^\t\n\f <&#;]{1,32};?',  # named reference candidate (1-32 chars)
    )) + ')'
)
137+
def unescape(s):
    """
    Replace every named or numeric character reference in the string s
    (for example &gt;, &#62; or &#x3e;) with the text it stands for.

    Each reference matched by the _charref pattern is rewritten by
    _replace_charref; named references are looked up in the HTML 5 table
    imported as _html5.  A string that contains no "&" is returned as is.
    """
    contains_reference = '&' in s
    if contains_reference:
        return _charref.sub(_replace_charref, s)
    # Nothing to decode: hand the input back without touching the regex.
    return s
149+
# Names exported by ``from <this module> import *``.
__all__ = [
    'escape',
    'unescape',
]
0 commit comments