Skip to content

Commit 5652384

Browse files
committed
Merge pull request #37 from kalessin/replacement
correctly handle tag name replacement when replaced tags are not closed.
2 parents 3adbe30 + 087d6d4 commit 5652384

File tree

2 files changed

+80
-21
lines changed

2 files changed

+80
-21
lines changed

scrapely/extraction/pageparsing.py

Lines changed: 43 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,14 @@ def handle_tag(self, html_tag, index):
6666
pass
6767

6868
_END_UNPAIREDTAG_TAGS = ["form", "div", "p", "table", "tr", "td"]
69-
69+
_AUTO_CLOSE_TAGS_ON_OPEN = {
70+
# opening a tag named by a key implicitly closes any still-open tag named in its list
71+
"p": ["p"],
72+
"option": ["option"],
73+
}
74+
_AUTO_CLOSE_TAGS_ON_CLOSE = {
75+
"select": ["option"],
76+
}
7077
class TemplatePageParser(InstanceLearningParser):
7178
"""Template parsing for instance based learning algorithm"""
7279

@@ -169,19 +176,15 @@ def _handle_open_tag(self, html_tag):
169176
self._close_unpaired_tag()
170177
else:
171178
self.unpairedtag_stack.append(html_tag.tag)
172-
173-
# can't be a p inside another p. Also, an open p element closes
174-
# a previous open p element.
175-
if html_tag.tag == "p" and html_tag.tag in self.labelled_tag_stacks:
176-
annotation = self.labelled_tag_stacks.pop(html_tag.tag)[0]
177-
annotation.end_index = self.next_tag_index
178-
self.annotations.append(annotation)
179-
179+
180+
tagname = replacement or self._update_replacement_stack(html_tag)
181+
self._handle_unclosed_tags(tagname, _AUTO_CLOSE_TAGS_ON_OPEN)
182+
180183
jannotation = self._read_template_annotation(html_tag)
181184
if not jannotation:
182-
if html_tag.tag in self.labelled_tag_stacks:
185+
if tagname in self.labelled_tag_stacks:
183186
# add this tag to the stack to match correct end tag
184-
self.labelled_tag_stacks[html_tag.tag].append(None)
187+
self.labelled_tag_stacks[tagname].append(None)
185188
self.next_tag_index += 1
186189
return
187190

@@ -227,7 +230,7 @@ def _handle_open_tag(self, html_tag):
227230

228231
# look for a closing tag if the content is important
229232
if annotation.surrounds_attribute:
230-
self.labelled_tag_stacks[html_tag.tag].append(annotation)
233+
self.labelled_tag_stacks[tagname].append(annotation)
231234
else:
232235
annotation.end_index = annotation.start_index + 1
233236
self.annotations.append(annotation)
@@ -239,6 +242,7 @@ def _handle_close_tag(self, html_tag):
239242
self.unpairedtag_stack.pop()
240243
else:
241244
self._close_unpaired_tag()
245+
242246
ignored_tags = self.ignored_tag_stacks.get(html_tag.tag)
243247
if ignored_tags is not None:
244248
tag = ignored_tags.pop()
@@ -250,15 +254,10 @@ def _handle_close_tag(self, html_tag):
250254
if len(ignored_tags) == 0:
251255
del self.ignored_tag_stacks[html_tag.tag]
252256

253-
if html_tag.tag in self.replacement_stacks:
254-
replacement = self.replacement_stacks[html_tag.tag].pop()
255-
if replacement:
256-
self.token_list.pop()
257-
self._add_token(replacement, html_tag.tag_type, html_tag.start, html_tag.end)
258-
if len(self.replacement_stacks[html_tag.tag]) == 0:
259-
del self.replacement_stacks[html_tag.tag]
257+
tagname = self._update_replacement_stack(html_tag)
258+
self._handle_unclosed_tags(tagname, _AUTO_CLOSE_TAGS_ON_CLOSE)
260259

261-
labelled_tags = self.labelled_tag_stacks.get(html_tag.tag)
260+
labelled_tags = self.labelled_tag_stacks.get(tagname)
262261
if labelled_tags is None:
263262
self.next_tag_index += 1
264263
return
@@ -274,12 +273,35 @@ def _handle_close_tag(self, html_tag):
274273
else:
275274
self.next_tag_index += 1
276275
if len(labelled_tags) == 0:
277-
del self.labelled_tag_stacks[html_tag.tag]
276+
del self.labelled_tag_stacks[tagname]
278277
if annotation.variant_id and self.variant_stack:
279278
prev = self.variant_stack.pop()
280279
if prev != annotation.variant_id:
281280
raise ValueError("unbalanced variant annotation tags")
282281

282+
def _update_replacement_stack(self, html_tag):
283+
replacement = html_tag.tag
284+
if html_tag.tag in self.replacement_stacks:
285+
replacement = self.replacement_stacks[html_tag.tag].pop()
286+
if replacement:
287+
self.token_list.pop()
288+
self._add_token(replacement, html_tag.tag_type, html_tag.start, html_tag.end)
289+
if len(self.replacement_stacks[html_tag.tag]) == 0:
290+
del self.replacement_stacks[html_tag.tag]
291+
return replacement
292+
293+
def _handle_unclosed_tags(self, tagname, auto_close_tags):
294+
"""I.e. can't be a p inside another p. Also, an open p element closes
295+
a previous open p element"""
296+
if tagname in auto_close_tags:
297+
for _close_tag in auto_close_tags[tagname]:
298+
if _close_tag in self.labelled_tag_stacks:
299+
annotation = self.labelled_tag_stacks.pop(_close_tag)[0]
300+
annotation.end_index = self.next_tag_index
301+
self.annotations.append(annotation)
302+
break
303+
return tagname
304+
283305
def handle_data(self, html_data_fragment, index):
284306
fragment_text = self.html_page.fragment_data(html_data_fragment)
285307
self._process_text(fragment_text)

scrapely/tests/test_pageparsing.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,20 @@
117117
</body></html>
118118
"""
119119

120+
LABELLED_PAGE5a = u"""
121+
<ul data-scrapy-replacement="select" name="txtvariant" class="smalltextblk">
122+
<li data-scrapy-replacement="option" data-scrapy-annotate="{&quot;required&quot;: [], &quot;variant&quot;: 0, &quot;annotations&quot;: {&quot;content&quot;: &quot;price&quot;}, &quot;generated&quot;: false}" value="BLUE">Blue&nbsp;$9.95 - In Stock</li>
123+
<li data-scrapy-replacement="option" data-scrapy-annotate="{&quot;required&quot;: [], &quot;variant&quot;: 0, &quot;annotations&quot;: {&quot;content&quot;: &quot;price&quot;}, &quot;generated&quot;: false}" value="RED">Red&nbsp;$9.95 - In Stock</li>
124+
</ul>
125+
"""
126+
127+
LABELLED_PAGE5b = u"""
128+
<ul data-scrapy-replacement="select" name="txtvariant" class="smalltextblk">
129+
<li data-scrapy-replacement="option" data-scrapy-annotate="{&quot;required&quot;: [], &quot;variant&quot;: 0, &quot;annotations&quot;: {&quot;content&quot;: &quot;price&quot;}, &quot;generated&quot;: false}" value="BLUE">Blue&nbsp;$9.95 - In Stock
130+
<li data-scrapy-replacement="option" data-scrapy-annotate="{&quot;required&quot;: [], &quot;variant&quot;: 0, &quot;annotations&quot;: {&quot;content&quot;: &quot;price&quot;}, &quot;generated&quot;: false}" value="RED">Red&nbsp;$9.95 - In Stock
131+
</ul>
132+
"""
133+
120134
LABELLED_PAGE6 = u"""
121135
<html><body>
122136
Text A
@@ -266,6 +280,29 @@ def test_replacement(self):
266280
p = _parse_page(TemplatePageParser, LABELLED_PAGE5)
267281
self.assertEqual(_tags(p, bool), ['<html>', '<body>', '<select>', '<option>',
268282
'</option>', '<li>', '</li>', '<option>', '</option>', '</select>', '</body>', '</html>'])
283+
284+
def test_replacement2(self):
    """Replaced tags that are explicitly closed keep their annotations"""
    parsed = _parse_page(TemplatePageParser, LABELLED_PAGE5a)
    expected_tags = [u'<select>', u'<option>', u'</option>', u'<option>', u'</option>', u'</select>']
    self.assertEqual(_tags(parsed, bool), expected_tags)

    # (start_index, end_index) expected for each annotated option, in order
    expected_spans = [(1, 2), (3, 4)]
    for position, (start, end) in enumerate(expected_spans):
        self.assertEqual(parsed.annotations[position].surrounds_attribute, 'price')
        self.assertEqual(parsed.annotations[position].start_index, start)
        self.assertEqual(parsed.annotations[position].end_index, end)
294+
295+
296+
def test_replacement3(self):
    """Replaced tags that are never closed still yield bounded annotations"""
    parsed = _parse_page(TemplatePageParser, LABELLED_PAGE5b)
    expected_tags = [u'<select>', u'<option>', u'<option>', u'</select>']
    self.assertEqual(_tags(parsed, bool), expected_tags)

    # (start_index, end_index) expected for each annotated option, in order;
    # each unclosed option ends where the next tag begins
    expected_spans = [(1, 2), (2, 3)]
    for position, (start, end) in enumerate(expected_spans):
        self.assertEqual(parsed.annotations[position].surrounds_attribute, 'price')
        self.assertEqual(parsed.annotations[position].start_index, start)
        self.assertEqual(parsed.annotations[position].end_index, end)
269306

270307
def test_partial(self):
271308
"""Test partial annotation parsing"""

0 commit comments

Comments
 (0)