Skip to content

Commit c2878f1

Browse files
committed
Handle void tags when processing as safehtml #92
1 parent 91fc620 commit c2878f1

File tree

1 file changed

+26
-7
lines changed

1 file changed

+26
-7
lines changed

scrapely/extractors.py

Lines changed: 26 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,12 @@
5151
}
5252
# tags whose content will be completely removed (recursively)
5353
# (overrides tags_to_keep and tags_to_replace)
54-
_TAGS_TO_PURGE = ('script', 'img', 'input')
54+
_TAGS_TO_PURGE = ('script', 'style', 'img', 'input')
55+
# tags that are automatically closed in HTML4 and HTML5
56+
_VOID_TAGS = frozenset([
57+
'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen',
58+
'link', 'meta', 'param', 'source', 'track', 'wbr'
59+
])
5560

5661

5762
def htmlregion(text):
@@ -104,7 +109,7 @@ def text(region):
104109

105110

106111
def safehtml(region, allowed_tags=_TAGS_TO_KEEP, replace_tags=_TAGS_TO_REPLACE,
107-
tags_to_purge=_TAGS_TO_PURGE):
112+
tags_to_purge=_TAGS_TO_PURGE):
108113
"""Creates an HTML subset, using a whitelist of HTML tags.
109114
110115
The HTML generated is safe for display on a website, without escaping and
@@ -117,16 +122,16 @@ def safehtml(region, allowed_tags=_TAGS_TO_KEEP, replace_tags=_TAGS_TO_REPLACE,
117122
opening and closing tag is removed.
118123
119124
For example:
120-
>>> t = lambda s: safehtml(htmlregion(s))
125+
>>> t = lambda s, keep=_TAGS_TO_KEEP: safehtml(htmlregion(s), keep)
121126
>>> t(u'<strong>test <blink>test</blink></strong>')
122127
u'<strong>test test</strong>'
123128
124129
Some tags, like script, are completely removed
125130
>>> t(u'<script>test </script>test')
126131
u'test'
127132
128-
replace_tags define tags that are converted. By default all headers, bold and indenting
129-
are converted to strong and em.
133+
replace_tags defines tags that are converted. By default all headers, bold
134+
and indenting are converted to strong and em.
130135
>>> t(u'<h2>header</h2> test <b>bold</b> <i>indent</i>')
131136
u'<strong>header</strong> test <strong>bold</strong> <em>indent</em>'
132137
@@ -145,14 +150,27 @@ def safehtml(region, allowed_tags=_TAGS_TO_KEEP, replace_tags=_TAGS_TO_REPLACE,
145150
>>> t(u'<p>test <i><br/><b>test</p>')
146151
u'<p>test <em><br/><strong>test</strong></em></p>'
147152
153+
Include or exclude tags that you want
154+
>>> t(u'Keep <meta name="name" content="data"> and <b><hr> tags')
155+
u'Keep and <strong> tags</strong>'
156+
>>> tags = set(list(_TAGS_TO_KEEP)[:] + ['meta', 'hr'])
157+
>>> t(u'Keep <meta name="name" content="data"> and <b><hr> tags', tags)
158+
u'Keep <meta> and <strong><hr> tags</strong>'
159+
160+
Handle void tags when purged
161+
>>> t(u'Keep content around <img src="image.jpg"> <b>img</b> tag')
162+
u'Keep content around <strong>img</strong> tag'
163+
148164
"""
149165
tagstack = []
166+
150167
def _process_tag(tag):
151168
tagstr = replace_tags.get(tag.tag, tag.tag)
152169
if tagstr not in allowed_tags:
153170
return
154171
if tag.tag_type == HtmlTagType.OPEN_TAG:
155-
tagstack.append(tagstr)
172+
if tag.tag not in _VOID_TAGS:
173+
tagstack.append(tagstr)
156174
return u"<%s>" % tagstr
157175
elif tag.tag_type == HtmlTagType.CLOSE_TAG:
158176
try:
@@ -188,7 +206,8 @@ def _process_markup(region, textf, tagf, tags_to_purge=_TAGS_TO_PURGE):
188206
tag = fragment.tag
189207
if tag in tags_to_purge:
190208
# if opening, keep going until closed
191-
if fragment.tag_type == HtmlTagType.OPEN_TAG:
209+
if (fragment.tag_type == HtmlTagType.OPEN_TAG and
210+
tag not in _VOID_TAGS):
192211
for probe in fiter:
193212
if isinstance(probe, HtmlTag) and \
194213
probe.tag == tag and \

0 commit comments

Comments
 (0)