Merge pull request #22 from kalessin/master

shaneaevans · shaneaevans · commit 1d2e8cfa2aff · 2012-04-26T09:53:31.000-07:00
Fix extraction of an ignored region when the region lies inside an annotation that uses text content extraction, and add a test
diff --git a/scrapely/extraction/pageobjects.py b/scrapely/extraction/pageobjects.py
@@ -7,7 +7,7 @@
 from itertools import chain
 from numpy import array, ndarray
 
-from scrapely.htmlpage import HtmlTagType, HtmlPageRegion
+from scrapely.htmlpage import HtmlTagType, HtmlPageRegion, HtmlPageParsedRegion
 
 class TokenType(HtmlTagType):
     """constants for token types"""
@@ -82,7 +82,7 @@ def __str__(self):
     def __repr__(self):
         return str(self)
 
-class FragmentedHtmlPageRegion(HtmlPageRegion):
+class FragmentedHtmlPageRegion(HtmlPageParsedRegion, HtmlPageRegion):
     """An HtmlPageRegion consisting of possibly non-contiguous sub-regions"""
     def __new__(cls, htmlpage, regions):
         text = u''.join(regions)
@@ -95,11 +95,7 @@ def __init__(self, htmlpage, regions):
     @property
     def parsed_fragments(self):
         return chain(*(r.parsed_fragments for r in self.regions))
-
-    @property
-    def text_content(self):
-        return chain(*(r.text_content for r in self.regions))
-
+        
 class Page(object):
     """Basic representation of a page. This consists of a reference to a
     dictionary of tokens and an array of raw token ids
diff --git a/scrapely/tests/test_extraction.py b/scrapely/tests/test_extraction.py
@@ -960,6 +960,10 @@
         'item test',
         [A('phone', 'phone number', lambda x: contains_any_numbers(x.text_content))])
 
+SAMPLE_DESCRIPTOR4 =  ItemDescriptor('test', 
+        'item test, removes tags from description attribute',
+        [A('description', 'description field without tags', lambda x: x.text_content)])
+
 # A list of (test name, [templates], page, extractors, expected_result)
 TEST_DATA = [
     # extract from a similar page
@@ -1052,6 +1056,14 @@
              'price': [u'\n12.00\n(VAT exc.)'],
           }
     ),
+    # ignored regions and text content extraction
+    (
+    'ignored_regions', [ANNOTATED_PAGE8], EXTRACT_PAGE8, SAMPLE_DESCRIPTOR4,
+          {
+             'description': [u'\n A very nice product for all intelligent people \n \n'],
+             'price': [u'\n12.00\n(VAT exc.)'],
+          }
+    ),
     # shifted ignored regions (detected by region similarity)
     (
     'shifted_ignored_regions', [ANNOTATED_PAGE9], EXTRACT_PAGE9, DEFAULT_DESCRIPTOR,