Skip to content

Commit c8adbb3

Browse files
committed
Allow annotating attributes named "content" in an unambiguous way
and with complete backward compatibility, by allowing the annotation to specify which key is used for the tag content, falling back to the default "content" if the "text-content" annotation meta attribute is not present
1 parent 4bbc3c7 commit c8adbb3

File tree

2 files changed

+32
-2
lines changed

2 files changed

+32
-2
lines changed

scrapely/extraction/pageparsing.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -125,8 +125,9 @@ def _handle_unpaired_tag(self, html_tag):
125125

126126
annotation = AnnotationTag(self.next_tag_index, self.next_tag_index + 1)
127127
attribute_annotations = jannotation.pop('annotations', {}).items()
128+
content_key = jannotation.pop('text-content', 'content')
128129
for extract_attribute, tag_value in attribute_annotations:
129-
if extract_attribute == 'content':
130+
if extract_attribute == content_key:
130131
annotation.surrounds_attribute = tag_value
131132
self.unpairedtag_stack.append(annotation)
132133
else:
@@ -200,8 +201,9 @@ def _handle_open_tag(self, html_tag):
200201
self.extra_required_attrs.extend(jannotation.pop('required', []))
201202

202203
attribute_annotations = jannotation.pop('annotations', {}).items()
204+
content_key = jannotation.pop('text-content', 'content')
203205
for extract_attribute, tag_value in attribute_annotations:
204-
if extract_attribute == 'content':
206+
if extract_attribute == content_key:
205207
annotation.surrounds_attribute = tag_value
206208
else:
207209
annotation.tag_attributes.append((extract_attribute, tag_value))

scrapely/tests/test_pageparsing.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,18 @@
178178
</body></html>
179179
"""
180180

181+
LABELLED_PAGE12 = u"""
182+
<head>
183+
<meta name="description" content="This is the description" data-scrapy-annotate="{&quot;variant&quot;: 0, &quot;generated&quot;: false, &quot;text-content&quot;: &quot;text-content:&quot;, &quot;annotations&quot;: {&quot;content&quot;: &quot;description&quot;}}" />
184+
</head>
185+
"""
186+
187+
LABELLED_PAGE13 = u"""
188+
<head>
189+
<meta name="description" content="This is the description" data-scrapy-annotate="{&quot;variant&quot;: 0, &quot;generated&quot;: false, &quot;text-content&quot;: &quot;text-content&quot;, &quot;annotations&quot;: {&quot;content&quot;: &quot;description&quot;, &quot;text-content&quot;: &quot;name&quot;}}">This is the name</meta>
190+
</head>
191+
"""
192+
181193
def _parse_page(parser_class, pagetext):
182194
htmlpage = HtmlPage(None, {}, pagetext)
183195
parser = parser_class(TokenDict())
@@ -304,6 +316,22 @@ def test_variant_attribute(self):
304316
annotations = _parse_page(TemplatePageParser, LABELLED_PAGE11).annotations
305317
self.assertEqual(annotations[0].variant_id, 1)
306318

319+
def test_content_attribute(self):
320+
"""
321+
Test that attribute with name content is unambiguously interpreted
322+
"""
323+
annotations = _parse_page(TemplatePageParser, LABELLED_PAGE12).annotations
324+
self.assertEqual(annotations[0].surrounds_attribute, None)
325+
self.assertEqual(annotations[0].tag_attributes, [("content", "description")])
326+
327+
def test_content_and_content_attribute(self):
328+
"""
329+
Test that attribute with name content and the content itself are unambiguously interpreted
330+
"""
331+
annotations = _parse_page(TemplatePageParser, LABELLED_PAGE13).annotations
332+
self.assertEqual(annotations[0].surrounds_attribute, 'name')
333+
self.assertEqual(annotations[0].tag_attributes, [("content", "description")])
334+
307335
def test_site_pages(self):
308336
"""
309337
Tests from real pages. More reliable and easy to build for more complicated structures

0 commit comments

Comments
 (0)