5151}
5252# tags whose content will be completely removed (recursively)
5353# (overrides tags_to_keep and tags_to_replace)
54- _TAGS_TO_PURGE = ('script' , 'img' , 'input' )
54+ _TAGS_TO_PURGE = ('script' , 'style' , 'img' , 'input' )
55+ # tags that are automatically closed in HTML4 and HTML5
56+ _VOID_TAGS = frozenset ([
57+ 'area' , 'base' , 'br' , 'col' , 'embed' , 'hr' , 'img' , 'input' , 'keygen' ,
58+ 'link' , 'meta' , 'param' , 'source' , 'track' , 'wbr'
59+ ])
5560
5661
5762def htmlregion (text ):
@@ -104,7 +109,7 @@ def text(region):
104109
105110
106111def safehtml (region , allowed_tags = _TAGS_TO_KEEP , replace_tags = _TAGS_TO_REPLACE ,
107- tags_to_purge = _TAGS_TO_PURGE ):
112+ tags_to_purge = _TAGS_TO_PURGE ):
108113 """Creates an HTML subset, using a whitelist of HTML tags.
109114
110115 The HTML generated is safe for display on a website, without escaping and
@@ -117,16 +122,16 @@ def safehtml(region, allowed_tags=_TAGS_TO_KEEP, replace_tags=_TAGS_TO_REPLACE,
117122 opening and closing tag is removed.
118123
119124 For example:
120- >>> t = lambda s: safehtml(htmlregion(s))
125+ >>> t = lambda s, keep=_TAGS_TO_KEEP : safehtml(htmlregion(s), keep )
121126 >>> t(u'<strong>test <blink>test</blink></strong>')
122127 u'<strong>test test</strong>'
123128
124129 Some tags, like script, are completely removed
125130 >>> t(u'<script>test </script>test')
126131 u'test'
127132
128- replace_tags define tags that are converted. By default all headers, bold and indenting
129- are converted to strong and em.
133+ replace_tags define tags that are converted. By default all headers, bold
134+ and indenting are converted to strong and em.
130135 >>> t(u'<h2>header</h2> test <b>bold</b> <i>indent</i>')
131136 u'<strong>header</strong> test <strong>bold</strong> <em>indent</em>'
132137
@@ -145,14 +150,27 @@ def safehtml(region, allowed_tags=_TAGS_TO_KEEP, replace_tags=_TAGS_TO_REPLACE,
145150 >>> t(u'<p>test <i><br/><b>test</p>')
146151 u'<p>test <em><br/><strong>test</strong></em></p>'
147152
153+ Include or exclude tags that you want
154+ >>> t(u'Keep <meta name="name" content="data"> and <b><hr> tags')
155+ u'Keep and <strong> tags</strong>'
156+ >>> tags = set(list(_TAGS_TO_KEEP)[:] + ['meta', 'hr'])
157+ >>> t(u'Keep <meta name="name" content="data"> and <b><hr> tags', tags)
158+ u'Keep <meta> and <strong><hr> tags</strong>'
159+
160+ Handle void tags when purged
161+ >>> t(u'Keep content around <img src="image.jpg"> <b>img</b> tag')
162+ u'Keep content around <strong>img</strong> tag'
163+
148164 """
149165 tagstack = []
166+
150167 def _process_tag (tag ):
151168 tagstr = replace_tags .get (tag .tag , tag .tag )
152169 if tagstr not in allowed_tags :
153170 return
154171 if tag .tag_type == HtmlTagType .OPEN_TAG :
155- tagstack .append (tagstr )
172+ if tag .tag not in _VOID_TAGS :
173+ tagstack .append (tagstr )
156174 return u"<%s>" % tagstr
157175 elif tag .tag_type == HtmlTagType .CLOSE_TAG :
158176 try :
@@ -188,7 +206,8 @@ def _process_markup(region, textf, tagf, tags_to_purge=_TAGS_TO_PURGE):
188206 tag = fragment .tag
189207 if tag in tags_to_purge :
190208 # if opening, keep going until closed
191- if fragment .tag_type == HtmlTagType .OPEN_TAG :
209+ if (fragment .tag_type == HtmlTagType .OPEN_TAG and
210+ tag not in _VOID_TAGS ):
192211 for probe in fiter :
193212 if isinstance (probe , HtmlTag ) and \
194213 probe .tag == tag and \
0 commit comments