correctly extract regions that follow more than one consecutive miss

kalessin · kalessin · commit eacd37fbe254 · 2012-11-28T18:58:00.000Z
diff --git a/scrapely/extraction/regionextract.py b/scrapely/extraction/regionextract.py
@@ -409,7 +409,7 @@ def _doextract(self, page, region_elements, start_index, end_index, nested_regio
             end_index, _, following_data = self._doextract(page, following_regions, start_index, end_index, **kwargs)
             if end_index is not None:
                 pindex, sindex, extracted_data = self._doextract(page, [first_region], start_index, end_index - 1, nested_regions, ignored_regions, **kwargs)
-                extracted_data += following_data
+            extracted_data += following_data
         elif nested_regions:
             _, _, nested_data = self._doextract(page, nested_regions, start_index, end_index, **kwargs)
             extracted_data += nested_data
diff --git a/scrapely/tests/test_extraction.py b/scrapely/tests/test_extraction.py
@@ -936,6 +936,29 @@
 <div><span><script>var myvar= 10;</script></span></div>
 """
 
+ANNOTATED_PAGE31 = u"""
+<html><body>
+<div>
+<span data-scrapy-annotate="{&quot;variant&quot;: 0, &quot;annotations&quot;: {&quot;content&quot;: &quot;name&quot;}}">Product name</span>
+<div><p data-scrapy-annotate="{&quot;variant&quot;: 0, &quot;annotations&quot;: {&quot;content&quot;: &quot;price&quot;}}">60.00</p>
+<span data-scrapy-annotate="{&quot;variant&quot;: 0, &quot;annotations&quot;: {&quot;content&quot;: &quot;description&quot;}}">description</span>
+<span data-scrapy-annotate="{&quot;variant&quot;: 0, &quot;annotations&quot;: {&quot;content&quot;: &quot;features&quot;}}">features</span>
+<img data-scrapy-annotate="{&quot;variant&quot;: 0, &quot;annotations&quot;: {&quot;src&quot;: &quot;image_urls&quot;}}" src="image.jpg" />
+<table></table>
+</div></div>
+</body></html>
+"""
+
+EXTRACT_PAGE31 = u"""
+<html><body>
+<div>
+<span>Product name</span>
+<div><p>60.00</p>
+<img src="http://example.com/image.jpg" />
+<table></table>
+</div></div>
+</body></html>
+"""
 
 DEFAULT_DESCRIPTOR = ItemDescriptor('test', 
         'item test, removes tags from description attribute',
@@ -950,6 +973,15 @@
             ]
         )
 
+SAMPLE_DESCRIPTOR1a = ItemDescriptor('test', 'product test', [
+            A('name', "Product name"),
+            A('price', "Product price, including any discounts and tax or vat", 
+                contains_any_numbers),    
+            A('image_urls', "URLs for one or more images", image_url),
+            A('description', "The full description of the product", html),
+            ]
+        )
+
 SAMPLE_DESCRIPTOR2 = ItemDescriptor('test', 'item test', [
         A('description', 'description field without tags', notags),
         A('price', "Product price, including any discounts and tax or vat",
@@ -1227,6 +1259,13 @@
     ('avoid false positives on scripts', [ANNOTATED_PAGE30], EXTRACT_PAGE30d, SAMPLE_DESCRIPTOR3,
         None
     ),
+    ('correctly extract regions that follows more than one consecutive misses', [ANNOTATED_PAGE31], EXTRACT_PAGE31, SAMPLE_DESCRIPTOR1a,
+        {
+            u'price': [u'60.00'],
+            u'name': [u'Product name'],
+            u'image_urls': [['http://example.com/image.jpg']]
+        }
+    )
 ]
 
 class TestIbl(TestCase):