Allow annotating attributes named "content" in an unambiguous way

kalessin · kalessin · commit c8adbb38c6b0 · 2012-10-03T23:44:12.000-02:00
and with complete backward compatibility, by letting the annotation specify
which annotation key is used for the tag content, falling back to the default
"content" if the "text-content" annotation meta attribute is not present.
diff --git a/scrapely/extraction/pageparsing.py b/scrapely/extraction/pageparsing.py
@@ -125,8 +125,9 @@ def _handle_unpaired_tag(self, html_tag):
                 
             annotation = AnnotationTag(self.next_tag_index, self.next_tag_index + 1)
             attribute_annotations = jannotation.pop('annotations', {}).items()
+            content_key = jannotation.pop('text-content', 'content')
             for extract_attribute, tag_value in attribute_annotations:
-                if extract_attribute == 'content':
+                if extract_attribute == content_key:
                     annotation.surrounds_attribute = tag_value
                     self.unpairedtag_stack.append(annotation)
                 else:
@@ -200,8 +201,9 @@ def _handle_open_tag(self, html_tag):
         self.extra_required_attrs.extend(jannotation.pop('required', []))
         
         attribute_annotations = jannotation.pop('annotations', {}).items()
+        content_key = jannotation.pop('text-content', 'content')
         for extract_attribute, tag_value in attribute_annotations:
-            if extract_attribute == 'content':
+            if extract_attribute == content_key:
                 annotation.surrounds_attribute = tag_value
             else:
                 annotation.tag_attributes.append((extract_attribute, tag_value))
diff --git a/scrapely/tests/test_pageparsing.py b/scrapely/tests/test_pageparsing.py
@@ -178,6 +178,18 @@
 </body></html>
 """
 
+LABELLED_PAGE12 = u"""
+<head>
+<meta name="description" content="This is the description" data-scrapy-annotate="{&quot;variant&quot;: 0, &quot;generated&quot;: false, &quot;text-content&quot;: &quot;text-content:&quot;, &quot;annotations&quot;: {&quot;content&quot;: &quot;description&quot;}}" />
+</head>
+"""
+
+LABELLED_PAGE13 = u"""
+<head>
+<meta name="description" content="This is the description" data-scrapy-annotate="{&quot;variant&quot;: 0, &quot;generated&quot;: false, &quot;text-content&quot;: &quot;text-content&quot;, &quot;annotations&quot;: {&quot;content&quot;: &quot;description&quot;, &quot;text-content&quot;: &quot;name&quot;}}">This is the name</meta>
+</head>
+"""
+
 def _parse_page(parser_class, pagetext):
     htmlpage = HtmlPage(None, {}, pagetext)
     parser = parser_class(TokenDict())
@@ -304,6 +316,22 @@ def test_variant_attribute(self):
         annotations = _parse_page(TemplatePageParser, LABELLED_PAGE11).annotations
         self.assertEqual(annotations[0].variant_id, 1)
 
+    def test_content_attribute(self):
+        """
+        Test that attribute with name content is unambiguously interpreted
+        """
+        annotations = _parse_page(TemplatePageParser, LABELLED_PAGE12).annotations
+        self.assertEqual(annotations[0].surrounds_attribute, None)
+        self.assertEqual(annotations[0].tag_attributes, [("content", "description")])
+
+    def test_content_and_content_attribute(self):
+        """
+        Test that attribute with name content and the content itself are unambiguously interpreted
+        """
+        annotations = _parse_page(TemplatePageParser, LABELLED_PAGE13).annotations
+        self.assertEqual(annotations[0].surrounds_attribute, 'name')
+        self.assertEqual(annotations[0].tag_attributes, [("content", "description")])
+
     def test_site_pages(self):
         """
         Tests from real pages. More reliable and easy to build for more complicated structures