Merge pull request #18 from kalessin/master

shaneaevans · shaneaevans · commit 741340aefaff · 2012-02-16T05:58:51.000-08:00
Allow defining field extractors over text content only, without the need to re-parse.
diff --git a/scrapely/extraction/pageobjects.py b/scrapely/extraction/pageobjects.py
@@ -96,6 +96,10 @@ def __init__(self, htmlpage, regions):
     def parsed_fragments(self):
         return chain(*(r.parsed_fragments for r in self.regions))
 
+    @property
+    def text_content(self):
+        return chain(*(r.text_content for r in self.regions))
+
 class Page(object):
     """Basic representation of a page. This consists of a reference to a
     dictionary of tokens and an array of raw token ids
diff --git a/scrapely/htmlpage.py b/scrapely/htmlpage.py
@@ -75,6 +75,15 @@ def fragment_data(self, data_fragment):
         """portion of the body corresponding to the HtmlDataFragment"""
         return self.body[data_fragment.start:data_fragment.end]
 
+class TextPage(HtmlPage):
+    """An HtmlPage with one unique HtmlDataFragment, needed to have a
+    convenient text with the same interface as an html page, while avoiding
+    unnecessary reparsing"""
+    def _set_body(self, text): 
+        self._body = text
+        self.parsed_body = [HtmlDataFragment(0, len(self._body), True)]
+    body = property(lambda x: x._body, _set_body, doc="raw text for the page")
+
 class HtmlPageRegion(unicode):
     """A Region of an HtmlPage that has been extracted
     """
@@ -87,7 +96,11 @@ def __init__(self, htmlpage, data):
         htmlpage is the original page and data is the raw html
         """
         self.htmlpage = htmlpage
-    
+ 
+    @property
+    def text_content(self):
+        return self
+        
 class HtmlPageParsedRegion(HtmlPageRegion):
     """A region of an HtmlPage that has been extracted
 
@@ -111,20 +124,31 @@ def parsed_fragments(self):
         end = self.end_index + 1 if self.end_index is not None else None
         return self.htmlpage.parsed_body[self.start_index:end]
 
+    @property
+    def text_content(self):
+        """Text content of this parsed region"""
+        text_all = u" ".join(self.htmlpage.body[_element.start:_element.end] \
+                for _element in self.parsed_fragments if \
+                not isinstance(_element, HtmlTag) and _element.is_text_content)
+        return TextPage(self.htmlpage.url, self.htmlpage.headers, \
+                text_all, encoding=self.htmlpage.encoding).subregion()
+
+
 class HtmlTagType(object):
     OPEN_TAG = 1
     CLOSE_TAG = 2 
     UNPAIRED_TAG = 3
 
 class HtmlDataFragment(object):
-    __slots__ = ('start', 'end')
+    __slots__ = ('start', 'end', 'is_text_content')
     
-    def __init__(self, start, end):
+    def __init__(self, start, end, is_text_content=False):
         self.start = start
         self.end = end
+        self.is_text_content = is_text_content
         
     def __str__(self):
-        return "<HtmlDataFragment [%s:%s]>" % (self.start, self.end)
+        return "<HtmlDataFragment [%s:%s] is_text_content: %s>" % (self.start, self.end, self.is_text_content)
 
     def __repr__(self):
         return str(self)
@@ -171,7 +195,7 @@ def parse_html(text):
         end = match.end()
             
         if start > prev_end:
-            yield HtmlDataFragment(prev_end, start)
+            yield HtmlDataFragment(prev_end, start, True)
 
         if match.groups()[0] is not None: # comment
             yield HtmlDataFragment(start, end)
@@ -183,7 +207,7 @@ def parse_html(text):
         prev_end = end
     textlen = len(text)
     if prev_end < textlen:
-        yield HtmlDataFragment(prev_end, textlen)
+        yield HtmlDataFragment(prev_end, textlen, True)
 
 def _parse_script(match):
     """parse a <script>...</script> region matched by _HTML_REGEXP"""
diff --git a/scrapely/tests/samples/samples_htmlpage_0.json b/scrapely/tests/samples/samples_htmlpage_0.json
@@ -211,15 +211,18 @@
         },
         {       
                 "start": 1073,
-                "end": 1074
+                "end": 1074,
+                "is_text_content": false
         },
         {
                 "start": 1074, 
-                "end": 2052
+                "end": 2052,
+                "is_text_content": false
         },
         {
                 "start": 2052,
-                "end": 2053
+                "end": 2053,
+                "is_text_content": false
         },
         {
                 "attributes": {}, 
diff --git a/scrapely/tests/samples/samples_htmlpage_1.json b/scrapely/tests/samples/samples_htmlpage_1.json
@@ -212,7 +212,8 @@
         },
         {
                 "start": 1956,
-                "end": 1979
+                "end": 1979,
+                "is_text_content": false
         },
         {
                 "start": 1979,
@@ -274,15 +275,18 @@
         }, 
         {
                 "start": 2282, 
-                "end": 2283
+                "end": 2283,
+                "is_text_content": false
         },
         {
                 "start": 2283,
-                "end": 2437
+                "end": 2437,
+                "is_text_content": false
         },
         {
                 "start": 2437,
-                "end": 2438
+                "end": 2438,
+                "is_text_content": false
         }, 
         {
                 "attributes": {}, 
@@ -306,7 +310,8 @@
         }, 
         {
                 "start": 2482, 
-                "end": 2702
+                "end": 2702,
+                "is_text_content": false
         }, 
         {
                 "attributes": {}, 
@@ -330,7 +335,8 @@
         }, 
         {
                 "start": 2743, 
-                "end": 2851
+                "end": 2851,
+                "is_text_content": false
         }, 
         {
                 "attributes": {}, 
@@ -345,7 +351,8 @@
         },
         {
                 "start": 2861,
-                "end": 2882
+                "end": 2882,
+                "is_text_content": false
         },
         {
                 "start": 2882,
@@ -986,7 +993,8 @@
         }, 
         {
                 "start": 5346, 
-                "end": 5537
+                "end": 5537,
+                "is_text_content": false
         }, 
         {
                 "attributes": {}, 
@@ -8851,7 +8859,8 @@
         }, 
         {
                 "start": 30410, 
-                "end": 30920
+                "end": 30920,
+                "is_text_content": false
         }, 
         {
                 "attributes": {}, 
@@ -9551,7 +9560,8 @@
         }, 
         {
                 "start": 33433, 
-                "end": 33454
+                "end": 33454,
+                "is_text_content": false
         }, 
         {
                 "attributes": {
@@ -9576,7 +9586,8 @@
         }, 
         {
                 "start": 33669, 
-                "end": 33689
+                "end": 33689,
+                "is_text_content": false
         }, 
         {
                 "attributes": {}, 
@@ -21129,7 +21140,8 @@
         }, 
         {
                 "start": 70112, 
-                "end": 70136
+                "end": 70136,
+                "is_text_content": false
         }, 
         {
                 "attributes": {}, 
diff --git a/scrapely/tests/samples/samples_htmlpage_2.json b/scrapely/tests/samples/samples_htmlpage_2.json
@@ -244,7 +244,8 @@
         },
         {
                 "start": 2182,
-                "end": 2205
+                "end": 2205,
+                "is_text_content": false
         },
         {
                 "start": 2205,
@@ -306,15 +307,18 @@
         }, 
         {
                 "start": 2508, 
-                "end": 2509
+                "end": 2509,
+                "is_text_content": false
         },
         {
                 "start": 2509,
-                "end": 2663
+                "end": 2663,
+                "is_text_content": false
         },
         {
                 "start": 2663,
-                "end": 2664
+                "end": 2664,
+                "is_text_content": false
         }, 
         {
                 "attributes": {}, 
@@ -338,7 +342,8 @@
         }, 
         {
                 "start": 2708, 
-                "end": 2928
+                "end": 2928,
+                "is_text_content": false
         }, 
         {
                 "attributes": {}, 
@@ -362,7 +367,8 @@
         }, 
         {
                 "start": 2969, 
-                "end": 3077
+                "end": 3077,
+                "is_text_content": false
         }, 
         {
                 "attributes": {}, 
@@ -377,7 +383,8 @@
         },
         {
                 "start": 3087,
-                "end": 3108
+                "end": 3108,
+                "is_text_content": false
         },
         {
                 "start": 3108,
@@ -993,7 +1000,8 @@
         }, 
         {
                 "start": 5449, 
-                "end": 5640
+                "end": 5640,
+                "is_text_content": false
         }, 
         {
                 "attributes": {}, 
@@ -8828,7 +8836,8 @@
         }, 
         {
                 "start": 30096, 
-                "end": 30606
+                "end": 30606,
+                "is_text_content": false
         }, 
         {
                 "attributes": {}, 
@@ -9501,7 +9510,8 @@
         }, 
         {
                 "start": 32812, 
-                "end": 32833
+                "end": 32833,
+                "is_text_content": false
         }, 
         {
                 "attributes": {
@@ -9526,7 +9536,8 @@
         }, 
         {
                 "start": 33044, 
-                "end": 33064
+                "end": 33064,
+                "is_text_content": false
         }, 
         {
                 "attributes": {}, 
@@ -20956,7 +20967,8 @@
         }, 
         {
                 "start": 69651, 
-                "end": 69675
+                "end": 69675,
+                "is_text_content": false
         }, 
         {
                 "attributes": {}, 
diff --git a/scrapely/tests/test_extraction.py b/scrapely/tests/test_extraction.py
@@ -915,6 +915,28 @@
 </table>
 """
 
+ANNOTATED_PAGE30 = u"""
+<div data-scrapy-annotate="{&quot;variant&quot;: 0, &quot;generated&quot;: false,
+ &quot;annotations&quot;: {&quot;content&quot;: &quot;phone&quot;}}"><span>029349293</span></div>
+"""
+
+EXTRACT_PAGE30a = u"""
+<div><span style="font-size:100%">Any text</span></div>
+"""
+
+EXTRACT_PAGE30b = u"""
+<div><span style="font-size:100%">029847272</span></div>
+"""
+
+EXTRACT_PAGE30c = u"""
+<div><span><!--item no. 100--></span></div>
+"""
+
+EXTRACT_PAGE30d = u"""
+<div><span><script>var myvar= 10;</script></span></div>
+"""
+
+
 DEFAULT_DESCRIPTOR = ItemDescriptor('test', 
         'item test, removes tags from description attribute',
         [A('description', 'description field without tags', notags)])
@@ -934,6 +956,9 @@
                 contains_any_numbers),
     ])
 
+SAMPLE_DESCRIPTOR3 = ItemDescriptor('test', 
+        'item test',
+        [A('phone', 'phone number', lambda x: contains_any_numbers(x.text_content))])
 
 # A list of (test name, [templates], page, extractors, expected_result)
 TEST_DATA = [
@@ -1178,6 +1203,19 @@
             }
 
     ),
+    ('avoid false positives by allowing to extract only from text content', [ANNOTATED_PAGE30], EXTRACT_PAGE30a, SAMPLE_DESCRIPTOR3,
+        {}
+    ),
+    ('only extract from text content', [ANNOTATED_PAGE30], EXTRACT_PAGE30b, SAMPLE_DESCRIPTOR3,
+        {u'phone': [u'029847272']}
+    ),
+    ('avoid false positives on comments', [ANNOTATED_PAGE30], EXTRACT_PAGE30c, SAMPLE_DESCRIPTOR3,
+        {}
+    ),
+    ('avoid false positives on scripts', [ANNOTATED_PAGE30], EXTRACT_PAGE30d, SAMPLE_DESCRIPTOR3,
+        {}
+    ),
+
 ]
 
 class TestIbl(TestCase):
diff --git a/scrapely/tests/test_htmlpage.py b/scrapely/tests/test_htmlpage.py
diff --git a/scrapely/tests/test_htmlpage_data.py b/scrapely/tests/test_htmlpage_data.py

Original file line number	Diff line number	Diff line change
`@@ -211,15 +211,18 @@`
`211`	`211`	`},`
`212`	`212`	`{`
`213`	`213`	`"start": 1073,`
`214`		`- "end": 1074`
	`214`	`+ "end": 1074,`
	`215`	`+ "is_text_content": false`
`215`	`216`	`},`
`216`	`217`	`{`
`217`	`218`	`"start": 1074,`
`218`		`- "end": 2052`
	`219`	`+ "end": 2052,`
	`220`	`+ "is_text_content": false`
`219`	`221`	`},`
`220`	`222`	`{`
`221`	`223`	`"start": 2052,`
`222`		`- "end": 2053`
	`224`	`+ "end": 2053,`
	`225`	`+ "is_text_content": false`
`223`	`226`	`},`
`224`	`227`	`{`
`225`	`228`	`"attributes": {},`
Original file line number	Diff line number	Diff line change
`@@ -212,7 +212,8 @@`
`212`	`212`	`},`
`213`	`213`	`{`
`214`	`214`	`"start": 1956,`
`215`		`- "end": 1979`
	`215`	`+ "end": 1979,`
	`216`	`+ "is_text_content": false`
`216`	`217`	`},`
`217`	`218`	`{`
`218`	`219`	`"start": 1979,`
`@@ -274,15 +275,18 @@`
`274`	`275`	`},`
`275`	`276`	`{`
`276`	`277`	`"start": 2282,`
`277`		`- "end": 2283`
	`278`	`+ "end": 2283,`
	`279`	`+ "is_text_content": false`
`278`	`280`	`},`
`279`	`281`	`{`
`280`	`282`	`"start": 2283,`
`281`		`- "end": 2437`
	`283`	`+ "end": 2437,`
	`284`	`+ "is_text_content": false`
`282`	`285`	`},`
`283`	`286`	`{`
`284`	`287`	`"start": 2437,`
`285`		`- "end": 2438`
	`288`	`+ "end": 2438,`
	`289`	`+ "is_text_content": false`
`286`	`290`	`},`
`287`	`291`	`{`
`288`	`292`	`"attributes": {},`
`@@ -306,7 +310,8 @@`
`306`	`310`	`},`
`307`	`311`	`{`
`308`	`312`	`"start": 2482,`
`309`		`- "end": 2702`
	`313`	`+ "end": 2702,`
	`314`	`+ "is_text_content": false`
`310`	`315`	`},`
`311`	`316`	`{`
`312`	`317`	`"attributes": {},`
`@@ -330,7 +335,8 @@`
`330`	`335`	`},`
`331`	`336`	`{`
`332`	`337`	`"start": 2743,`
`333`		`- "end": 2851`
	`338`	`+ "end": 2851,`
	`339`	`+ "is_text_content": false`
`334`	`340`	`},`
`335`	`341`	`{`
`336`	`342`	`"attributes": {},`
`@@ -345,7 +351,8 @@`
`345`	`351`	`},`
`346`	`352`	`{`
`347`	`353`	`"start": 2861,`
`348`		`- "end": 2882`
	`354`	`+ "end": 2882,`
	`355`	`+ "is_text_content": false`
`349`	`356`	`},`
`350`	`357`	`{`
`351`	`358`	`"start": 2882,`
`@@ -986,7 +993,8 @@`
`986`	`993`	`},`
`987`	`994`	`{`
`988`	`995`	`"start": 5346,`
`989`		`- "end": 5537`
	`996`	`+ "end": 5537,`
	`997`	`+ "is_text_content": false`
`990`	`998`	`},`
`991`	`999`	`{`
`992`	`1000`	`"attributes": {},`
`@@ -8851,7 +8859,8 @@`
`8851`	`8859`	`},`
`8852`	`8860`	`{`
`8853`	`8861`	`"start": 30410,`
`8854`		`- "end": 30920`
	`8862`	`+ "end": 30920,`
	`8863`	`+ "is_text_content": false`
`8855`	`8864`	`},`
`8856`	`8865`	`{`
`8857`	`8866`	`"attributes": {},`
`@@ -9551,7 +9560,8 @@`
`9551`	`9560`	`},`
`9552`	`9561`	`{`
`9553`	`9562`	`"start": 33433,`
`9554`		`- "end": 33454`
	`9563`	`+ "end": 33454,`
	`9564`	`+ "is_text_content": false`
`9555`	`9565`	`},`
`9556`	`9566`	`{`
`9557`	`9567`	`"attributes": {`
`@@ -9576,7 +9586,8 @@`
`9576`	`9586`	`},`
`9577`	`9587`	`{`
`9578`	`9588`	`"start": 33669,`
`9579`		`- "end": 33689`
	`9589`	`+ "end": 33689,`
	`9590`	`+ "is_text_content": false`
`9580`	`9591`	`},`
`9581`	`9592`	`{`
`9582`	`9593`	`"attributes": {},`
`@@ -21129,7 +21140,8 @@`
`21129`	`21140`	`},`
`21130`	`21141`	`{`
`21131`	`21142`	`"start": 70112,`
`21132`		`- "end": 70136`
	`21143`	`+ "end": 70136,`
	`21144`	`+ "is_text_content": false`
`21133`	`21145`	`},`
`21134`	`21146`	`{`
`21135`	`21147`	`"attributes": {},`
Original file line number	Diff line number	Diff line change
`@@ -244,7 +244,8 @@`
`244`	`244`	`},`
`245`	`245`	`{`
`246`	`246`	`"start": 2182,`
`247`		`- "end": 2205`
	`247`	`+ "end": 2205,`
	`248`	`+ "is_text_content": false`
`248`	`249`	`},`
`249`	`250`	`{`
`250`	`251`	`"start": 2205,`
`@@ -306,15 +307,18 @@`
`306`	`307`	`},`
`307`	`308`	`{`
`308`	`309`	`"start": 2508,`
`309`		`- "end": 2509`
	`310`	`+ "end": 2509,`
	`311`	`+ "is_text_content": false`
`310`	`312`	`},`
`311`	`313`	`{`
`312`	`314`	`"start": 2509,`
`313`		`- "end": 2663`
	`315`	`+ "end": 2663,`
	`316`	`+ "is_text_content": false`
`314`	`317`	`},`
`315`	`318`	`{`
`316`	`319`	`"start": 2663,`
`317`		`- "end": 2664`
	`320`	`+ "end": 2664,`
	`321`	`+ "is_text_content": false`
`318`	`322`	`},`
`319`	`323`	`{`
`320`	`324`	`"attributes": {},`
`@@ -338,7 +342,8 @@`
`338`	`342`	`},`
`339`	`343`	`{`
`340`	`344`	`"start": 2708,`
`341`		`- "end": 2928`
	`345`	`+ "end": 2928,`
	`346`	`+ "is_text_content": false`
`342`	`347`	`},`
`343`	`348`	`{`
`344`	`349`	`"attributes": {},`
`@@ -362,7 +367,8 @@`
`362`	`367`	`},`
`363`	`368`	`{`
`364`	`369`	`"start": 2969,`
`365`		`- "end": 3077`
	`370`	`+ "end": 3077,`
	`371`	`+ "is_text_content": false`
`366`	`372`	`},`
`367`	`373`	`{`
`368`	`374`	`"attributes": {},`
`@@ -377,7 +383,8 @@`
`377`	`383`	`},`
`378`	`384`	`{`
`379`	`385`	`"start": 3087,`
`380`		`- "end": 3108`
	`386`	`+ "end": 3108,`
	`387`	`+ "is_text_content": false`
`381`	`388`	`},`
`382`	`389`	`{`
`383`	`390`	`"start": 3108,`
`@@ -993,7 +1000,8 @@`
`993`	`1000`	`},`
`994`	`1001`	`{`
`995`	`1002`	`"start": 5449,`
`996`		`- "end": 5640`
	`1003`	`+ "end": 5640,`
	`1004`	`+ "is_text_content": false`
`997`	`1005`	`},`
`998`	`1006`	`{`
`999`	`1007`	`"attributes": {},`
`@@ -8828,7 +8836,8 @@`
`8828`	`8836`	`},`
`8829`	`8837`	`{`
`8830`	`8838`	`"start": 30096,`
`8831`		`- "end": 30606`
	`8839`	`+ "end": 30606,`
	`8840`	`+ "is_text_content": false`
`8832`	`8841`	`},`
`8833`	`8842`	`{`
`8834`	`8843`	`"attributes": {},`
`@@ -9501,7 +9510,8 @@`
`9501`	`9510`	`},`
`9502`	`9511`	`{`
`9503`	`9512`	`"start": 32812,`
`9504`		`- "end": 32833`
	`9513`	`+ "end": 32833,`
	`9514`	`+ "is_text_content": false`
`9505`	`9515`	`},`
`9506`	`9516`	`{`
`9507`	`9517`	`"attributes": {`
`@@ -9526,7 +9536,8 @@`
`9526`	`9536`	`},`
`9527`	`9537`	`{`
`9528`	`9538`	`"start": 33044,`
`9529`		`- "end": 33064`
	`9539`	`+ "end": 33064,`
	`9540`	`+ "is_text_content": false`
`9530`	`9541`	`},`
`9531`	`9542`	`{`
`9532`	`9543`	`"attributes": {},`
`@@ -20956,7 +20967,8 @@`
`20956`	`20967`	`},`
`20957`	`20968`	`{`
`20958`	`20969`	`"start": 69651,`
`20959`		`- "end": 69675`
	`20970`	`+ "end": 69675,`
	`20971`	`+ "is_text_content": false`
`20960`	`20972`	`},`
`20961`	`20973`	`{`
`20962`	`20974`	`"attributes": {},`