Skip to content

Commit eacd37f

Browse files
committed
Correctly extract regions that follow more than one consecutive miss
1 parent d292a6e commit eacd37f

File tree

2 files changed

+40
-1
lines changed

2 files changed

+40
-1
lines changed

scrapely/extraction/regionextract.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -409,7 +409,7 @@ def _doextract(self, page, region_elements, start_index, end_index, nested_regio
409409
end_index, _, following_data = self._doextract(page, following_regions, start_index, end_index, **kwargs)
410410
if end_index is not None:
411411
pindex, sindex, extracted_data = self._doextract(page, [first_region], start_index, end_index - 1, nested_regions, ignored_regions, **kwargs)
412-
extracted_data += following_data
412+
extracted_data += following_data
413413
elif nested_regions:
414414
_, _, nested_data = self._doextract(page, nested_regions, start_index, end_index, **kwargs)
415415
extracted_data += nested_data

scrapely/tests/test_extraction.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -936,6 +936,29 @@
936936
<div><span><script>var myvar= 10;</script></span></div>
937937
"""
938938

939+
ANNOTATED_PAGE31 = u"""
940+
<html><body>
941+
<div>
942+
<span data-scrapy-annotate="{&quot;variant&quot;: 0, &quot;annotations&quot;: {&quot;content&quot;: &quot;name&quot;}}">Product name</span>
943+
<div><p data-scrapy-annotate="{&quot;variant&quot;: 0, &quot;annotations&quot;: {&quot;content&quot;: &quot;price&quot;}}">60.00</p>
944+
<span data-scrapy-annotate="{&quot;variant&quot;: 0, &quot;annotations&quot;: {&quot;content&quot;: &quot;description&quot;}}">description</span>
945+
<span data-scrapy-annotate="{&quot;variant&quot;: 0, &quot;annotations&quot;: {&quot;content&quot;: &quot;features&quot;}}">features</span>
946+
<img data-scrapy-annotate="{&quot;variant&quot;: 0, &quot;annotations&quot;: {&quot;src&quot;: &quot;image_urls&quot;}}" src="image.jpg" />
947+
<table></table>
948+
</div></div>
949+
</body></html>
950+
"""
951+
952+
EXTRACT_PAGE31 = u"""
953+
<html><body>
954+
<div>
955+
<span>Product name</span>
956+
<div><p>60.00</p>
957+
<img src="http://example.com/image.jpg" />
958+
<table></table>
959+
</div></div>
960+
</body></html>
961+
"""
939962

940963
DEFAULT_DESCRIPTOR = ItemDescriptor('test',
941964
'item test, removes tags from description attribute',
@@ -950,6 +973,15 @@
950973
]
951974
)
952975

976+
SAMPLE_DESCRIPTOR1a = ItemDescriptor('test', 'product test', [
977+
A('name', "Product name"),
978+
A('price', "Product price, including any discounts and tax or vat",
979+
contains_any_numbers),
980+
A('image_urls', "URLs for one or more images", image_url),
981+
A('description', "The full description of the product", html),
982+
]
983+
)
984+
953985
SAMPLE_DESCRIPTOR2 = ItemDescriptor('test', 'item test', [
954986
A('description', 'description field without tags', notags),
955987
A('price', "Product price, including any discounts and tax or vat",
@@ -1227,6 +1259,13 @@
12271259
('avoid false positives on scripts', [ANNOTATED_PAGE30], EXTRACT_PAGE30d, SAMPLE_DESCRIPTOR3,
12281260
None
12291261
),
1262+
('correctly extract regions that follows more than one consecutive misses', [ANNOTATED_PAGE31], EXTRACT_PAGE31, SAMPLE_DESCRIPTOR1a,
1263+
{
1264+
u'price': [u'60.00'],
1265+
u'name': [u'Product name'],
1266+
u'image_urls': [['http://example.com/image.jpg']]
1267+
}
1268+
)
12301269
]
12311270

12321271
class TestIbl(TestCase):

0 commit comments

Comments
 (0)