Merge pull request #119 from scrapinghub/fix-incorrectly-formatted-description-property

jakubwasikowski · web-flow · commit 6df8e1908882 · 2019-07-19T13:34:55.000+02:00
Fix incorrectly formatted description property
diff --git a/extruct/w3cmicrodata.py b/extruct/w3cmicrodata.py
@@ -18,11 +18,32 @@
     from urllib.parse import urljoin
 
 import lxml.etree
+from lxml.html.clean import Cleaner
 from w3lib.html import strip_html5_whitespace
+import html_text
 
 from extruct.utils import parse_html
 
 
+# Cleaner similar to the html_text cleaner, but less aggressive: it keeps forms, frames and embedded content
+cleaner = Cleaner(
+    scripts=True,
+    javascript=False,  # onclick attributes are fine
+    comments=True,
+    style=True,
+    links=True,
+    meta=True,
+    page_structure=False,  # <title> may be nice to have
+    processing_instructions=True,
+    embedded=False,  # keep embedded content
+    frames=False,  # keep frames
+    forms=False,  # keep forms
+    annoying_tags=False,
+    remove_unknown_tags=False,
+    safe_attrs_only=False,
+)
+
+
 class LxmlMicrodataExtractor(object):
     _xp_item = lxml.etree.XPath('descendant-or-self::*[@itemscope]')
     _xp_prop = lxml.etree.XPath("""set:difference(.//*[@itemprop],
@@ -182,7 +203,8 @@ def _extract_property_value(self, node, items_seen, base_url, force=False):
             return self._extract_textContent(node)
 
     def _extract_textContent(self, node):
-        return u"".join(self._xp_clean_text(node)).strip()
+        clean_node = cleaner.clean_html(node)
+        return html_text.etree_to_text(clean_node)
 
 
 MicrodataExtractor = LxmlMicrodataExtractor
diff --git a/requirements.txt b/requirements.txt
@@ -7,5 +7,6 @@ requests
 rdflib
 rdflib-jsonld
 mf2py>=1.1.0
-six
+six>=1.11
 w3lib
+html-text
diff --git a/setup.py b/setup.py
@@ -33,6 +33,7 @@ def get_version():
                       'rdflib-jsonld', 
                       'mf2py', 
                       'w3lib',
+                      'html-text>=0.5.1',
                       'six'],
     extras_require={
         'service': [
diff --git a/tests/samples/schema.org/Event.002.json b/tests/samples/schema.org/Event.002.json
@@ -37,7 +37,7 @@
                                            "offers": "foo-fighters-everlong-buy.html",
                                            "url": "foo-fighters-everlong.html"},
                             "type": "http://schema.org/MusicRecording"}],
-                 "video": {"properties": {"description": "Catch this exclusive interview with\n    Dave Grohl and the Foo Fighters about their new album, Rope.",
+                 "video": {"properties": {"description": "Catch this exclusive interview with Dave Grohl and the Foo Fighters about their new album, Rope.",
                                            "duration": "T1M33S",
                                            "name": "Interview with the Foo Fighters",
                                            "thumbnail": "foo-fighters-interview-thumb.jpg"},
diff --git a/tests/samples/schema.org/MusicRecording.001.json b/tests/samples/schema.org/MusicRecording.001.json
@@ -37,7 +37,7 @@
                                            "offers": "foo-fighters-everlong-buy.html",
                                            "url": "foo-fighters-everlong.html"},
                             "type": "http://schema.org/MusicRecording"}],
-                 "video": {"properties": {"description": "Catch this exclusive interview with\n    Dave Grohl and the Foo Fighters about their new album, Rope.",
+                 "video": {"properties": {"description": "Catch this exclusive interview with Dave Grohl and the Foo Fighters about their new album, Rope.",
                                           "duration": "T1M33S",
                                           "name": "Interview with the Foo Fighters",
                                           "thumbnail": "foo-fighters-interview-thumb.jpg"},
diff --git a/tests/samples/schema.org/product-ref.json b/tests/samples/schema.org/product-ref.json
@@ -32,7 +32,7 @@
             ],
             "brand": "ACME",
             "name": "Executive Anvil",
-            "description": "Sleeker than ACME's Classic Anvil, the\n      Executive Anvil is perfect for the business traveler\n      looking for something to drop from a height.",
+            "description": "Sleeker than ACME's Classic Anvil, the Executive Anvil is perfect for the business traveler looking for something to drop from a height.",
             "mpn": "925872",
             "aggregateRating": {
                 "type": "http://schema.org/AggregateRating",
diff --git a/tests/samples/schema.org/product.json b/tests/samples/schema.org/product.json
@@ -2,7 +2,7 @@
   "properties": {"brand": "ACME",
                  "name": "Executive Anvil",
                  "image": "anvil_executive.jpg",
-                 "description": "Sleeker than ACME's Classic Anvil, the\n      Executive Anvil is perfect for the business traveler\n      looking for something to drop from a height.",
+                 "description": "Sleeker than ACME's Classic Anvil, the Executive Anvil is perfect for the business traveler looking for something to drop from a height.",
                  "mpn": "925872",
                  "aggregateRating": {"type": "http://schema.org/AggregateRating",
                                      "properties": {"ratingValue": "4.4",
diff --git a/tests/samples/schema.org/product_custom_url.json b/tests/samples/schema.org/product_custom_url.json
@@ -2,7 +2,7 @@
   "properties": {"brand": "ACME",
                  "name": "Executive Anvil",
                  "image": "http://some-example.com/anvil_executive.jpg",
-                 "description": "Sleeker than ACME's Classic Anvil, the\n      Executive Anvil is perfect for the business traveler\n      looking for something to drop from a height.",
+                 "description": "Sleeker than ACME's Classic Anvil, the Executive Anvil is perfect for the business traveler looking for something to drop from a height.",
                  "mpn": "925872",
                  "aggregateRating": {"type": "http://schema.org/AggregateRating",
                                      "properties": {"ratingValue": "4.4",
diff --git a/tests/samples/schema.org/product_custom_url_and_node_id.json b/tests/samples/schema.org/product_custom_url_and_node_id.json
@@ -3,7 +3,7 @@
   "properties": {"brand": "ACME",
                  "name": "Executive Anvil",
                  "image": "http://some-example.com/anvil_executive.jpg",
-                 "description": "Sleeker than ACME's Classic Anvil, the\n      Executive Anvil is perfect for the business traveler\n      looking for something to drop from a height.",
+                 "description": "Sleeker than ACME's Classic Anvil, the Executive Anvil is perfect for the business traveler looking for something to drop from a height.",
                  "mpn": "925872",
                  "aggregateRating": {"type": "http://schema.org/AggregateRating",
                                      "_nodeId_": "aggregateRating",
diff --git a/tests/samples/w3c/microdata.5.2.withtext.json b/tests/samples/w3c/microdata.5.2.withtext.json
@@ -2,18 +2,18 @@
                  "name": "Tank Locomotive (DB 80)",
                  "product-code": "33041",
                  "scale": "HO"},
-  "textContent": "Name:\n Tank Locomotive (DB 80)\n Product code:\n 33041\n Scale:\n HO\n Digital:\n Delta",
+  "textContent": "Name:\nTank Locomotive (DB 80)\nProduct code:\n33041\nScale:\nHO\nDigital:\nDelta",
   "type": ["http://md.example.com/loco",
            "http://md.example.com/lighting"]},
  {"properties": {"name": "Turnout Lantern Kit",
                  "product-code": "74470",
                  "scale": "HO",
                  "track-type": "C"},
-  "textContent": "Name:\n Turnout Lantern Kit\n Product code:\n 74470\n Purpose:\n For retrofitting 2 C Track\n turnouts.",
+  "textContent": "Name:\nTurnout Lantern Kit\nProduct code:\n74470\nPurpose:\nFor retrofitting 2 C Track turnouts.",
   "type": ["http://md.example.com/track",
            "http://md.example.com/lighting"]},
  {"properties": {"name": "Express Train Passenger Car (DB Am 203)",
                  "product-code": "8710",
                  "scale": "Z"},
-  "textContent": "Name:\n Express Train Passenger Car (DB Am 203)\n Product code:\n 8710\n Scale:\n Z",
+  "textContent": "Name:\nExpress Train Passenger Car (DB Am 203)\nProduct code:\n8710\nScale:\nZ",
   "type": "http://md.example.com/passengers"}]
diff --git a/tests/samples/websites/microdata-with-description.html b/tests/samples/websites/microdata-with-description.html
diff --git a/tests/samples/websites/microdata-with-description.json b/tests/samples/websites/microdata-with-description.json
diff --git a/tests/test_microdata.py b/tests/test_microdata.py
diff --git a/tests/test_uniform.py b/tests/test_uniform.py