Don't raise an exception when an ignored region is not inside the

kalessin · kalessin · commit f21bc89fc004 · 2013-03-20T23:10:04.000Z
extracted region in repeated extractions. Instead, only apply the ignored
region when it lies inside the extracted region.
diff --git a/scrapely/extraction/regionextract.py b/scrapely/extraction/regionextract.py
@@ -107,13 +107,9 @@ def _extract_both(self, page, start_index, end_index, ignored_regions=None, **kw
             self._extract_attribute(page, start_index, end_index, ignored_regions)
 
     def _extract_content(self, extraction_page, start_index, end_index, ignored_regions=None, **kwargs):
-        # extract content between annotation indexes
-        if not ignored_regions:
-            region = extraction_page.htmlpage_region_inside(start_index, end_index)
-        else:
-            # assumes ignored_regions are completely contained within start and end index
-            assert (start_index <= ignored_regions[0].start_index and 
-                end_index >= ignored_regions[-1].end_index)
+        """extract content between annotation indexes"""
+        if ignored_regions and (start_index <= ignored_regions[0].start_index and
+                    end_index >= ignored_regions[-1].end_index):
             starts = [start_index] + [i.end_index for i in ignored_regions if i.end_index is not None]
             ends = [i.start_index for i in ignored_regions]
             if starts[-1] is not None:
@@ -123,6 +119,8 @@ def _extract_content(self, extraction_page, start_index, end_index, ignored_regi
                 included_regions.next()
             regions = starmap(extraction_page.htmlpage_region_inside, included_regions)
             region = FragmentedHtmlPageRegion(extraction_page.htmlpage, list(regions))
+        else:
+            region = extraction_page.htmlpage_region_inside(start_index, end_index)
         validated = self.content_validate(region)
         return [(self.annotation.surrounds_attribute, validated)] if validated else []
     
diff --git a/scrapely/tests/test_extraction.py b/scrapely/tests/test_extraction.py
@@ -960,6 +960,24 @@
 </body></html>
 """
 
+# repeated elements with ignored region only in one of them
+ANNOTATED_PAGE32 = u"""
+<ul>
+<li data-scrapy-annotate="{&quot;variant&quot;: 0, 
+    &quot;annotations&quot;: {&quot;content&quot;: &quot;features&quot;}}">feature1<span data-scrapy-ignore="true"> ignore this</span></li>
+<li data-scrapy-annotate="{&quot;variant&quot;: 0, 
+    &quot;annotations&quot;: {&quot;content&quot;: &quot;features&quot;}}">feature2</li>
+</ul>
+"""
+
+EXTRACT_PAGE32 = u"""
+<ul>
+<li>feature1<span> ignore this</span></li>
+<li>feature2</li>
+<li>feature3</li>
+</ul>
+"""
+
 DEFAULT_DESCRIPTOR = ItemDescriptor('test', 
         'item test, removes tags from description attribute',
         [A('description', 'description field without tags', notags)])
@@ -1265,7 +1283,10 @@
             u'name': [u'Product name'],
             u'image_urls': [['http://example.com/image.jpg']]
         }
-    )
+    ),
+    ('single ignored region inside a repeated structure', [ANNOTATED_PAGE32], EXTRACT_PAGE32, DEFAULT_DESCRIPTOR,
+        {'features': [u'feature1', u'feature2', u'feature3']}
+    ),
 ]
 
 class TestIbl(TestCase):