Skip to content

Commit f21bc89

Browse files
committed
Don't raise an exception when an ignored region is not inside the
extracted region in repeated extractions. Instead, only apply the ignored region when it lies inside the extracted region.
1 parent 709fc5f commit f21bc89

File tree

2 files changed

+27
-8
lines changed

2 files changed

+27
-8
lines changed

scrapely/extraction/regionextract.py

Lines changed: 5 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -107,13 +107,9 @@ def _extract_both(self, page, start_index, end_index, ignored_regions=None, **kw
107107
self._extract_attribute(page, start_index, end_index, ignored_regions)
108108

109109
def _extract_content(self, extraction_page, start_index, end_index, ignored_regions=None, **kwargs):
110-
# extract content between annotation indexes
111-
if not ignored_regions:
112-
region = extraction_page.htmlpage_region_inside(start_index, end_index)
113-
else:
114-
# assumes ignored_regions are completely contained within start and end index
115-
assert (start_index <= ignored_regions[0].start_index and
116-
end_index >= ignored_regions[-1].end_index)
110+
"""extract content between annotation indexes"""
111+
if ignored_regions and (start_index <= ignored_regions[0].start_index and
112+
end_index >= ignored_regions[-1].end_index):
117113
starts = [start_index] + [i.end_index for i in ignored_regions if i.end_index is not None]
118114
ends = [i.start_index for i in ignored_regions]
119115
if starts[-1] is not None:
@@ -123,6 +119,8 @@ def _extract_content(self, extraction_page, start_index, end_index, ignored_regi
123119
included_regions.next()
124120
regions = starmap(extraction_page.htmlpage_region_inside, included_regions)
125121
region = FragmentedHtmlPageRegion(extraction_page.htmlpage, list(regions))
122+
else:
123+
region = extraction_page.htmlpage_region_inside(start_index, end_index)
126124
validated = self.content_validate(region)
127125
return [(self.annotation.surrounds_attribute, validated)] if validated else []
128126

scrapely/tests/test_extraction.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -960,6 +960,24 @@
960960
</body></html>
961961
"""
962962

963+
# repeated elements with ignored region only in one of them
964+
ANNOTATED_PAGE32 = u"""
965+
<ul>
966+
<li data-scrapy-annotate="{&quot;variant&quot;: 0,
967+
&quot;annotations&quot;: {&quot;content&quot;: &quot;features&quot;}}">feature1<span data-scrapy-ignore="true"> ignore this</span></li>
968+
<li data-scrapy-annotate="{&quot;variant&quot;: 0,
969+
&quot;annotations&quot;: {&quot;content&quot;: &quot;features&quot;}}">feature2</li>
970+
</ul>
971+
"""
972+
973+
EXTRACT_PAGE32 = u"""
974+
<ul>
975+
<li>feature1<span> ignore this</span></li>
976+
<li>feature2</li>
977+
<li>feature3</li>
978+
</ul>
979+
"""
980+
963981
DEFAULT_DESCRIPTOR = ItemDescriptor('test',
964982
'item test, removes tags from description attribute',
965983
[A('description', 'description field without tags', notags)])
@@ -1265,7 +1283,10 @@
12651283
u'name': [u'Product name'],
12661284
u'image_urls': [['http://example.com/image.jpg']]
12671285
}
1268-
)
1286+
),
1287+
('single ignored region inside a repeated structure', [ANNOTATED_PAGE32], EXTRACT_PAGE32, DEFAULT_DESCRIPTOR,
1288+
{'features': [u'feature1', u'feature2', u'feature3']}
1289+
),
12691290
]
12701291

12711292
class TestIbl(TestCase):

0 commit comments

Comments
 (0)