Skip to content

Commit 3adbe30

Browse files
committed
Merge pull request #36 from kalessin/xmldeclaration
refactor text extractor and ignore xml declarations
2 parents 739f09b + 8d02188 commit 3adbe30

File tree

3 files changed

+12
-7
lines changed

3 files changed

+12
-7
lines changed

scrapely/extractors.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,6 @@
4747
'b' : 'strong',
4848
'i' : 'em',
4949
}
50-
5150
# tags whose content will be completely removed (recursively)
5251
# (overrides tags_to_keep and tags_to_replace)
5352
_TAGS_TO_PURGE = ('script', 'img', 'input')
@@ -91,12 +90,11 @@ def text(region):
9190
HTML entities are converted to text
9291
>>> t(u"only &pound;42")
9392
u'only \\xa342'
93+
94+
>>> t(u"<p>The text</p><?xml:namespace blabla/><p>is here</p>")
95+
u'The text is here'
9496
"""
95-
chunks = _process_markup(region,
96-
lambda text: remove_entities(text, encoding=region.htmlpage.encoding),
97-
lambda tag: u' '
98-
)
99-
text = u''.join(chunks)
97+
text = remove_entities(region.text_content, encoding=region.htmlpage.encoding)
10098
return _WS.sub(u' ', text).strip()
10199

102100
def safehtml(region, allowed_tags=_TAGS_TO_KEEP, replace_tags=_TAGS_TO_REPLACE,

scrapely/htmlpage.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -200,7 +200,7 @@ def __repr__(self):
200200
_TAG = "<(\/?)(\w+(?::\w+)?)((?:\s*" + _ATTR + ")+\s*|\s*)(\/?)>?"
201201
_DOCTYPE = r"<!DOCTYPE.*?>"
202202
_SCRIPT = "(<script.*?>)(.*?)(</script.*?>)"
203-
_COMMENT = "(<!--.*?-->)"
203+
_COMMENT = "(<!--.*?-->|<\?.+?>)"
204204

205205
_ATTR_REGEXP = re.compile(_ATTR, re.I | re.DOTALL)
206206
_HTML_REGEXP = re.compile("%s|%s|%s" % (_COMMENT, _SCRIPT, _TAG), re.I | re.DOTALL)

scrapely/tests/test_htmlpage.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,3 +138,10 @@ def test_malformed2(self):
138138
def test_empty_subregion(self):
139139
htmlpage = HtmlPage(body=u"")
140140
self.assertEqual(htmlpage.subregion(), u"")
141+
142+
def test_ignore_xml_declaration(self):
143+
"""Ignore xml declarations inside html"""
144+
parsed = list(parse_html(u"<p>The text</p><?xml:namespace blabla/><p>is here</p>"))
145+
self.assertFalse(parsed[3].is_text_content)
146+
147+

0 commit comments

Comments
 (0)