22
33import re
44from xml .sax .saxutils import escape , unescape
5+ from six .moves import urllib_parse as urlparse
56
67from .tokenizer import HTMLTokenizer
78from .constants import tokenTypes
89
910
# Pattern applied to the path component of a ``data:`` URI.  It captures the
# leading media type as the named group ``content_type``, accepts an optional
# ``;charset=...`` and/or ``;base64`` parameter in either order, and requires
# a comma followed by the payload.  Whitespace and ``#`` comments inside the
# pattern are ignored because it is compiled with ``re.VERBOSE``.
content_type_rgx = re.compile(
    r"""
    ^
    # Match a content type <application>/<type>
    (?P<content_type>[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+)
    # Match any character set and encoding
    (?:(?:;charset=(?:[-a-zA-Z0-9]+)(?:;(?:base64))?)
    |(?:;(?:base64))?(?:;charset=(?:[-a-zA-Z0-9]+))?)
    # Assume the rest is data
    ,.*
    $
    """,
    flags=re.VERBOSE,
)
23+
24+
1025class HTMLSanitizerMixin (object ):
1126 """ sanitization of XHTML+MathML+SVG and of inline style attributes."""
1227
@@ -138,7 +153,9 @@ class HTMLSanitizerMixin(object):
138153 acceptable_protocols = ['ed2k' , 'ftp' , 'http' , 'https' , 'irc' ,
139154 'mailto' , 'news' , 'gopher' , 'nntp' , 'telnet' , 'webcal' ,
140155 'xmpp' , 'callto' , 'feed' , 'urn' , 'aim' , 'rsync' , 'tag' ,
141- 'ssh' , 'sftp' , 'rtsp' , 'afs' ]
156+ 'ssh' , 'sftp' , 'rtsp' , 'afs' , 'data' ]
157+
158+ acceptable_content_types = ['image/png' , 'image/jpeg' , 'image/gif' , 'image/webp' , 'image/bmp' , 'text/plain' ]
142159
143160 # subclasses may define their own versions of these constants
144161 allowed_elements = acceptable_elements + mathml_elements + svg_elements
@@ -147,6 +164,7 @@ class HTMLSanitizerMixin(object):
147164 allowed_css_keywords = acceptable_css_keywords
148165 allowed_svg_properties = acceptable_svg_properties
149166 allowed_protocols = acceptable_protocols
167+ allowed_content_types = acceptable_content_types
150168
151169 # Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
152170 # stripping out all attributes not in ALLOWED_ATTRIBUTES. Style
@@ -189,10 +207,17 @@ def allowed_token(self, token, token_type):
189207 unescape (attrs [attr ])).lower ()
190208 # remove replacement characters from unescaped characters
191209 val_unescaped = val_unescaped .replace ("\ufffd " , "" )
192- if (re .match ("^[a-z0-9][-+.a-z0-9]*:" , val_unescaped ) and
193- (val_unescaped .split (':' )[0 ] not in
194- self .allowed_protocols )):
195- del attrs [attr ]
# Decide whether this URI-valued attribute may stay.  ``val_unescaped`` is
# the unescaped, lower-cased value computed just above.
try:
    uri = urlparse.urlparse(val_unescaped)
except ValueError:
    # urlparse rejects some malformed values (e.g. unbalanced brackets in
    # the netloc); a value that cannot be parsed is dropped.
    uri = None

if uri is None:
    del attrs[attr]
elif uri.scheme:
    # Only values that actually carry a scheme are checked against the
    # protocol whitelist.  The parse result itself is always truthy, so
    # testing it would remove every scheme-less (relative) URL.
    if uri.scheme not in self.allowed_protocols:
        del attrs[attr]
    elif uri.scheme == 'data':
        # data: URIs are kept only when they are well formed and declare a
        # whitelisted content type.  The branches are mutually exclusive so
        # the attribute is deleted at most once and ``m`` is never
        # dereferenced when the pattern did not match.
        m = content_type_rgx.match(uri.path)
        if not m or m.group('content_type') not in self.allowed_content_types:
            del attrs[attr]
220+
196221 for attr in self .svg_attr_val_allows_ref :
197222 if attr in attrs :
198223 attrs [attr ] = re .sub (r'url\s*\(\s*[^#\s][^)]+?\)' ,
0 commit comments