Skip to content

Commit bd34ced

Browse files
committed
Merge pull request #33 from kalessin/ignore_regions
Don't raise an exception when an ignored region is not inside the start and end index of the annotated region
2 parents 709fc5f + f21bc89 commit bd34ced

File tree

2 files changed

+27
-8
lines changed

2 files changed

+27
-8
lines changed

scrapely/extraction/regionextract.py

Lines changed: 5 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -107,13 +107,9 @@ def _extract_both(self, page, start_index, end_index, ignored_regions=None, **kw
107107
self._extract_attribute(page, start_index, end_index, ignored_regions)
108108

109109
def _extract_content(self, extraction_page, start_index, end_index, ignored_regions=None, **kwargs):
110-
# extract content between annotation indexes
111-
if not ignored_regions:
112-
region = extraction_page.htmlpage_region_inside(start_index, end_index)
113-
else:
114-
# assumes ignored_regions are completely contained within start and end index
115-
assert (start_index <= ignored_regions[0].start_index and
116-
end_index >= ignored_regions[-1].end_index)
110+
"""extract content between annotation indexes"""
111+
if ignored_regions and (start_index <= ignored_regions[0].start_index and
112+
end_index >= ignored_regions[-1].end_index):
117113
starts = [start_index] + [i.end_index for i in ignored_regions if i.end_index is not None]
118114
ends = [i.start_index for i in ignored_regions]
119115
if starts[-1] is not None:
@@ -123,6 +119,8 @@ def _extract_content(self, extraction_page, start_index, end_index, ignored_regi
123119
included_regions.next()
124120
regions = starmap(extraction_page.htmlpage_region_inside, included_regions)
125121
region = FragmentedHtmlPageRegion(extraction_page.htmlpage, list(regions))
122+
else:
123+
region = extraction_page.htmlpage_region_inside(start_index, end_index)
126124
validated = self.content_validate(region)
127125
return [(self.annotation.surrounds_attribute, validated)] if validated else []
128126

scrapely/tests/test_extraction.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -960,6 +960,24 @@
960960
</body></html>
961961
"""
962962

963+
# repeated elements with ignored region only in one of them
964+
ANNOTATED_PAGE32 = u"""
965+
<ul>
966+
<li data-scrapy-annotate="{&quot;variant&quot;: 0,
967+
&quot;annotations&quot;: {&quot;content&quot;: &quot;features&quot;}}">feature1<span data-scrapy-ignore="true"> ignore this</span></li>
968+
<li data-scrapy-annotate="{&quot;variant&quot;: 0,
969+
&quot;annotations&quot;: {&quot;content&quot;: &quot;features&quot;}}">feature2</li>
970+
</ul>
971+
"""
972+
973+
EXTRACT_PAGE32 = u"""
974+
<ul>
975+
<li>feature1<span> ignore this</span></li>
976+
<li>feature2</li>
977+
<li>feature3</li>
978+
</ul>
979+
"""
980+
963981
DEFAULT_DESCRIPTOR = ItemDescriptor('test',
964982
'item test, removes tags from description attribute',
965983
[A('description', 'description field without tags', notags)])
@@ -1265,7 +1283,10 @@
12651283
u'name': [u'Product name'],
12661284
u'image_urls': [['http://example.com/image.jpg']]
12671285
}
1268-
)
1286+
),
1287+
('single ignored region inside a repeated structure', [ANNOTATED_PAGE32], EXTRACT_PAGE32, DEFAULT_DESCRIPTOR,
1288+
{'features': [u'feature1', u'feature2', u'feature3']}
1289+
),
12691290
]
12701291

12711292
class TestIbl(TestCase):

0 commit comments

Comments
 (0)