Skip to content

Commit d292a6e

Browse files
committed
Merge pull request #29 from kalessin/contentattr
allow annotating attributes named "content"
2 parents 4bbc3c7 + c8adbb3 commit d292a6e

File tree

2 files changed

+32
-2
lines changed

2 files changed

+32
-2
lines changed

scrapely/extraction/pageparsing.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -125,8 +125,9 @@ def _handle_unpaired_tag(self, html_tag):
125125

126126
annotation = AnnotationTag(self.next_tag_index, self.next_tag_index + 1)
127127
attribute_annotations = jannotation.pop('annotations', {}).items()
128+
content_key = jannotation.pop('text-content', 'content')
128129
for extract_attribute, tag_value in attribute_annotations:
129-
if extract_attribute == 'content':
130+
if extract_attribute == content_key:
130131
annotation.surrounds_attribute = tag_value
131132
self.unpairedtag_stack.append(annotation)
132133
else:
@@ -200,8 +201,9 @@ def _handle_open_tag(self, html_tag):
200201
self.extra_required_attrs.extend(jannotation.pop('required', []))
201202

202203
attribute_annotations = jannotation.pop('annotations', {}).items()
204+
content_key = jannotation.pop('text-content', 'content')
203205
for extract_attribute, tag_value in attribute_annotations:
204-
if extract_attribute == 'content':
206+
if extract_attribute == content_key:
205207
annotation.surrounds_attribute = tag_value
206208
else:
207209
annotation.tag_attributes.append((extract_attribute, tag_value))

scrapely/tests/test_pageparsing.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,18 @@
178178
</body></html>
179179
"""
180180

181+
LABELLED_PAGE12 = u"""
182+
<head>
183+
<meta name="description" content="This is the description" data-scrapy-annotate="{&quot;variant&quot;: 0, &quot;generated&quot;: false, &quot;text-content&quot;: &quot;text-content:&quot;, &quot;annotations&quot;: {&quot;content&quot;: &quot;description&quot;}}" />
184+
</head>
185+
"""
186+
187+
LABELLED_PAGE13 = u"""
188+
<head>
189+
<meta name="description" content="This is the description" data-scrapy-annotate="{&quot;variant&quot;: 0, &quot;generated&quot;: false, &quot;text-content&quot;: &quot;text-content&quot;, &quot;annotations&quot;: {&quot;content&quot;: &quot;description&quot;, &quot;text-content&quot;: &quot;name&quot;}}">This is the name</meta>
190+
</head>
191+
"""
192+
181193
def _parse_page(parser_class, pagetext):
182194
htmlpage = HtmlPage(None, {}, pagetext)
183195
parser = parser_class(TokenDict())
@@ -304,6 +316,22 @@ def test_variant_attribute(self):
304316
annotations = _parse_page(TemplatePageParser, LABELLED_PAGE11).annotations
305317
self.assertEqual(annotations[0].variant_id, 1)
306318

319+
def test_content_attribute(self):
320+
"""
321+
Test that an attribute named "content" is interpreted unambiguously as a tag attribute, not as the element's text content
322+
"""
323+
annotations = _parse_page(TemplatePageParser, LABELLED_PAGE12).annotations
324+
self.assertEqual(annotations[0].surrounds_attribute, None)
325+
self.assertEqual(annotations[0].tag_attributes, [("content", "description")])
326+
327+
def test_content_and_content_attribute(self):
328+
"""
329+
Test that an attribute named "content" and the element's own text content are each interpreted unambiguously when both are annotated
330+
"""
331+
annotations = _parse_page(TemplatePageParser, LABELLED_PAGE13).annotations
332+
self.assertEqual(annotations[0].surrounds_attribute, 'name')
333+
self.assertEqual(annotations[0].tag_attributes, [("content", "description")])
334+
307335
def test_site_pages(self):
308336
"""
309337
Tests from real pages. More reliable and easy to build for more complicated structures

0 commit comments

Comments
 (0)