Merge pull request #36 from kalessin/xmldeclaration

pablohoffman · pablohoffman · commit 3adbe3025c48 · 2013-04-10T09:11:28.000-07:00
Refactor the text extractor to use region.text_content, and ignore XML declarations (<?...>) inside HTML
diff --git a/scrapely/extractors.py b/scrapely/extractors.py
@@ -47,7 +47,6 @@
     'b' : 'strong',
     'i' : 'em',
 }
-
 # tags whoose content will be completely removed (recursively)
 # (overrides tags_to_keep and tags_to_replace)
 _TAGS_TO_PURGE = ('script', 'img', 'input')
@@ -91,12 +90,11 @@ def text(region):
     HTML entities are converted to text
     >>> t(u"only &pound;42")
     u'only \\xa342'
+
+    >>> t(u"<p>The text</p><?xml:namespace blabla/><p>is here</p>")
+    u'The text is here'
     """
-    chunks = _process_markup(region, 
-        lambda text: remove_entities(text, encoding=region.htmlpage.encoding),
-        lambda tag: u' '
-    )
-    text = u''.join(chunks)
+    text = remove_entities(region.text_content, encoding=region.htmlpage.encoding)
     return _WS.sub(u' ', text).strip()
 
 def safehtml(region, allowed_tags=_TAGS_TO_KEEP, replace_tags=_TAGS_TO_REPLACE,
diff --git a/scrapely/htmlpage.py b/scrapely/htmlpage.py
@@ -200,7 +200,7 @@ def __repr__(self):
 _TAG = "<(\/?)(\w+(?::\w+)?)((?:\s*" + _ATTR + ")+\s*|\s*)(\/?)>?"
 _DOCTYPE = r"<!DOCTYPE.*?>"
 _SCRIPT = "(<script.*?>)(.*?)(</script.*?>)"
-_COMMENT = "(<!--.*?-->)"
+_COMMENT = "(<!--.*?-->|<\?.+?>)"
 
 _ATTR_REGEXP = re.compile(_ATTR, re.I | re.DOTALL)
 _HTML_REGEXP = re.compile("%s|%s|%s" % (_COMMENT, _SCRIPT, _TAG), re.I | re.DOTALL)
diff --git a/scrapely/tests/test_htmlpage.py b/scrapely/tests/test_htmlpage.py
@@ -138,3 +138,10 @@ def test_malformed2(self):
     def test_empty_subregion(self):
         htmlpage = HtmlPage(body=u"")
         self.assertEqual(htmlpage.subregion(), u"")
+
+    def test_ignore_xml_declaration(self):
+        """Ignore xml declarations inside html"""
+        parsed = list(parse_html(u"<p>The text</p><?xml:namespace blabla/><p>is here</p>"))
+        self.assertFalse(parsed[3].is_text_content)
+
+