|
18 | 18 | from urllib.parse import urljoin |
19 | 19 |
|
20 | 20 | import lxml.etree |
| 21 | +from lxml.html.clean import Cleaner |
21 | 22 | from w3lib.html import strip_html5_whitespace |
| 23 | +import html_text |
22 | 24 |
|
23 | 25 | from extruct.utils import parse_html |
24 | 26 |
|
25 | 27 |
|
| 28 | +# Cleaner which is similar to html_text cleaner, but is less aggressive |
| 29 | +cleaner = Cleaner( |
| 30 | + scripts=True, |
| 31 | + javascript=False, # onclick attributes are fine |
| 32 | + comments=True, |
| 33 | + style=True, |
| 34 | + links=True, |
| 35 | + meta=True, |
| 36 | + page_structure=False, # <title> may be nice to have |
| 37 | + processing_instructions=True, |
| 38 | + embedded=False, # keep embedded content |
| 39 | + frames=False, # keep frames |
| 40 | + forms=False, # keep forms |
| 41 | + annoying_tags=False, |
| 42 | + remove_unknown_tags=False, |
| 43 | + safe_attrs_only=False, |
| 44 | +) |
| 45 | + |
| 46 | + |
26 | 47 | class LxmlMicrodataExtractor(object): |
27 | 48 | _xp_item = lxml.etree.XPath('descendant-or-self::*[@itemscope]') |
28 | 49 | _xp_prop = lxml.etree.XPath("""set:difference(.//*[@itemprop], |
@@ -182,7 +203,8 @@ def _extract_property_value(self, node, items_seen, base_url, force=False): |
182 | 203 | return self._extract_textContent(node) |
183 | 204 |
|
184 | 205 | def _extract_textContent(self, node): |
185 | | - return u"".join(self._xp_clean_text(node)).strip() |
| 206 | + clean_node = cleaner.clean_html(node) |
| 207 | + return html_text.etree_to_text(clean_node) |
186 | 208 |
|
187 | 209 |
|
188 | 210 | MicrodataExtractor = LxmlMicrodataExtractor |
0 commit comments