Skip to content

Commit 741340a

Browse files
committed
Merge pull request #18 from kalessin/master
Allow defining field extractors over text content only, without the need to re-parse.
2 parents 1ce0032 + c0d849c commit 741340a

File tree

8 files changed

+149
-50
lines changed

8 files changed

+149
-50
lines changed

scrapely/extraction/pageobjects.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,10 @@ def __init__(self, htmlpage, regions):
9696
def parsed_fragments(self):
9797
return chain(*(r.parsed_fragments for r in self.regions))
9898

99+
@property
100+
def text_content(self):
101+
return chain(*(r.text_content for r in self.regions))
102+
99103
class Page(object):
100104
"""Basic representation of a page. This consists of a reference to a
101105
dictionary of tokens and an array of raw token ids

scrapely/htmlpage.py

Lines changed: 30 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,15 @@ def fragment_data(self, data_fragment):
7575
"""portion of the body corresponding to the HtmlDataFragment"""
7676
return self.body[data_fragment.start:data_fragment.end]
7777

78+
class TextPage(HtmlPage):
79+
"""An HtmlPage with one unique HtmlDataFragment, needed to have a
80+
convenient text with same interface as html page but avoiding unnecessary
81+
reparsing"""
82+
def _set_body(self, text):
83+
self._body = text
84+
self.parsed_body = [HtmlDataFragment(0, len(self._body), True)]
85+
body = property(lambda x: x._body, _set_body, doc="raw text for the page")
86+
7887
class HtmlPageRegion(unicode):
7988
"""A Region of an HtmlPage that has been extracted
8089
"""
@@ -87,7 +96,11 @@ def __init__(self, htmlpage, data):
8796
htmlpage is the original page and data is the raw html
8897
"""
8998
self.htmlpage = htmlpage
90-
99+
100+
@property
101+
def text_content(self):
102+
return self
103+
91104
class HtmlPageParsedRegion(HtmlPageRegion):
92105
"""A region of an HtmlPage that has been extracted
93106
@@ -111,20 +124,31 @@ def parsed_fragments(self):
111124
end = self.end_index + 1 if self.end_index is not None else None
112125
return self.htmlpage.parsed_body[self.start_index:end]
113126

127+
@property
128+
def text_content(self):
129+
"""Text content of this parsed region"""
130+
text_all = u" ".join(self.htmlpage.body[_element.start:_element.end] \
131+
for _element in self.parsed_fragments if \
132+
not isinstance(_element, HtmlTag) and _element.is_text_content)
133+
return TextPage(self.htmlpage.url, self.htmlpage.headers, \
134+
text_all, encoding=self.htmlpage.encoding).subregion()
135+
136+
114137
class HtmlTagType(object):
115138
OPEN_TAG = 1
116139
CLOSE_TAG = 2
117140
UNPAIRED_TAG = 3
118141

119142
class HtmlDataFragment(object):
120-
__slots__ = ('start', 'end')
143+
__slots__ = ('start', 'end', 'is_text_content')
121144

122-
def __init__(self, start, end):
145+
def __init__(self, start, end, is_text_content=False):
123146
self.start = start
124147
self.end = end
148+
self.is_text_content = is_text_content
125149

126150
def __str__(self):
127-
return "<HtmlDataFragment [%s:%s]>" % (self.start, self.end)
151+
return "<HtmlDataFragment [%s:%s] is_text_content: %s>" % (self.start, self.end, self.is_text_content)
128152

129153
def __repr__(self):
130154
return str(self)
@@ -171,7 +195,7 @@ def parse_html(text):
171195
end = match.end()
172196

173197
if start > prev_end:
174-
yield HtmlDataFragment(prev_end, start)
198+
yield HtmlDataFragment(prev_end, start, True)
175199

176200
if match.groups()[0] is not None: # comment
177201
yield HtmlDataFragment(start, end)
@@ -183,7 +207,7 @@ def parse_html(text):
183207
prev_end = end
184208
textlen = len(text)
185209
if prev_end < textlen:
186-
yield HtmlDataFragment(prev_end, textlen)
210+
yield HtmlDataFragment(prev_end, textlen, True)
187211

188212
def _parse_script(match):
189213
"""parse a <script>...</script> region matched by _HTML_REGEXP"""

scrapely/tests/samples/samples_htmlpage_0.json

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -211,15 +211,18 @@
211211
},
212212
{
213213
"start": 1073,
214-
"end": 1074
214+
"end": 1074,
215+
"is_text_content": false
215216
},
216217
{
217218
"start": 1074,
218-
"end": 2052
219+
"end": 2052,
220+
"is_text_content": false
219221
},
220222
{
221223
"start": 2052,
222-
"end": 2053
224+
"end": 2053,
225+
"is_text_content": false
223226
},
224227
{
225228
"attributes": {},

scrapely/tests/samples/samples_htmlpage_1.json

Lines changed: 24 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -212,7 +212,8 @@
212212
},
213213
{
214214
"start": 1956,
215-
"end": 1979
215+
"end": 1979,
216+
"is_text_content": false
216217
},
217218
{
218219
"start": 1979,
@@ -274,15 +275,18 @@
274275
},
275276
{
276277
"start": 2282,
277-
"end": 2283
278+
"end": 2283,
279+
"is_text_content": false
278280
},
279281
{
280282
"start": 2283,
281-
"end": 2437
283+
"end": 2437,
284+
"is_text_content": false
282285
},
283286
{
284287
"start": 2437,
285-
"end": 2438
288+
"end": 2438,
289+
"is_text_content": false
286290
},
287291
{
288292
"attributes": {},
@@ -306,7 +310,8 @@
306310
},
307311
{
308312
"start": 2482,
309-
"end": 2702
313+
"end": 2702,
314+
"is_text_content": false
310315
},
311316
{
312317
"attributes": {},
@@ -330,7 +335,8 @@
330335
},
331336
{
332337
"start": 2743,
333-
"end": 2851
338+
"end": 2851,
339+
"is_text_content": false
334340
},
335341
{
336342
"attributes": {},
@@ -345,7 +351,8 @@
345351
},
346352
{
347353
"start": 2861,
348-
"end": 2882
354+
"end": 2882,
355+
"is_text_content": false
349356
},
350357
{
351358
"start": 2882,
@@ -986,7 +993,8 @@
986993
},
987994
{
988995
"start": 5346,
989-
"end": 5537
996+
"end": 5537,
997+
"is_text_content": false
990998
},
991999
{
9921000
"attributes": {},
@@ -8851,7 +8859,8 @@
88518859
},
88528860
{
88538861
"start": 30410,
8854-
"end": 30920
8862+
"end": 30920,
8863+
"is_text_content": false
88558864
},
88568865
{
88578866
"attributes": {},
@@ -9551,7 +9560,8 @@
95519560
},
95529561
{
95539562
"start": 33433,
9554-
"end": 33454
9563+
"end": 33454,
9564+
"is_text_content": false
95559565
},
95569566
{
95579567
"attributes": {
@@ -9576,7 +9586,8 @@
95769586
},
95779587
{
95789588
"start": 33669,
9579-
"end": 33689
9589+
"end": 33689,
9590+
"is_text_content": false
95809591
},
95819592
{
95829593
"attributes": {},
@@ -21129,7 +21140,8 @@
2112921140
},
2113021141
{
2113121142
"start": 70112,
21132-
"end": 70136
21143+
"end": 70136,
21144+
"is_text_content": false
2113321145
},
2113421146
{
2113521147
"attributes": {},

scrapely/tests/samples/samples_htmlpage_2.json

Lines changed: 24 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -244,7 +244,8 @@
244244
},
245245
{
246246
"start": 2182,
247-
"end": 2205
247+
"end": 2205,
248+
"is_text_content": false
248249
},
249250
{
250251
"start": 2205,
@@ -306,15 +307,18 @@
306307
},
307308
{
308309
"start": 2508,
309-
"end": 2509
310+
"end": 2509,
311+
"is_text_content": false
310312
},
311313
{
312314
"start": 2509,
313-
"end": 2663
315+
"end": 2663,
316+
"is_text_content": false
314317
},
315318
{
316319
"start": 2663,
317-
"end": 2664
320+
"end": 2664,
321+
"is_text_content": false
318322
},
319323
{
320324
"attributes": {},
@@ -338,7 +342,8 @@
338342
},
339343
{
340344
"start": 2708,
341-
"end": 2928
345+
"end": 2928,
346+
"is_text_content": false
342347
},
343348
{
344349
"attributes": {},
@@ -362,7 +367,8 @@
362367
},
363368
{
364369
"start": 2969,
365-
"end": 3077
370+
"end": 3077,
371+
"is_text_content": false
366372
},
367373
{
368374
"attributes": {},
@@ -377,7 +383,8 @@
377383
},
378384
{
379385
"start": 3087,
380-
"end": 3108
386+
"end": 3108,
387+
"is_text_content": false
381388
},
382389
{
383390
"start": 3108,
@@ -993,7 +1000,8 @@
9931000
},
9941001
{
9951002
"start": 5449,
996-
"end": 5640
1003+
"end": 5640,
1004+
"is_text_content": false
9971005
},
9981006
{
9991007
"attributes": {},
@@ -8828,7 +8836,8 @@
88288836
},
88298837
{
88308838
"start": 30096,
8831-
"end": 30606
8839+
"end": 30606,
8840+
"is_text_content": false
88328841
},
88338842
{
88348843
"attributes": {},
@@ -9501,7 +9510,8 @@
95019510
},
95029511
{
95039512
"start": 32812,
9504-
"end": 32833
9513+
"end": 32833,
9514+
"is_text_content": false
95059515
},
95069516
{
95079517
"attributes": {
@@ -9526,7 +9536,8 @@
95269536
},
95279537
{
95289538
"start": 33044,
9529-
"end": 33064
9539+
"end": 33064,
9540+
"is_text_content": false
95309541
},
95319542
{
95329543
"attributes": {},
@@ -20956,7 +20967,8 @@
2095620967
},
2095720968
{
2095820969
"start": 69651,
20959-
"end": 69675
20970+
"end": 69675,
20971+
"is_text_content": false
2096020972
},
2096120973
{
2096220974
"attributes": {},

scrapely/tests/test_extraction.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -915,6 +915,28 @@
915915
</table>
916916
"""
917917

918+
ANNOTATED_PAGE30 = u"""
919+
<div data-scrapy-annotate="{&quot;variant&quot;: 0, &quot;generated&quot;: false,
920+
&quot;annotations&quot;: {&quot;content&quot;: &quot;phone&quot;}}"><span>029349293</span></div>
921+
"""
922+
923+
EXTRACT_PAGE30a = u"""
924+
<div><span style="font-size:100%">Any text</span></div>
925+
"""
926+
927+
EXTRACT_PAGE30b = u"""
928+
<div><span style="font-size:100%">029847272</span></div>
929+
"""
930+
931+
EXTRACT_PAGE30c = u"""
932+
<div><span><!--item no. 100--></span></div>
933+
"""
934+
935+
EXTRACT_PAGE30d = u"""
936+
<div><span><script>var myvar= 10;</script></span></div>
937+
"""
938+
939+
918940
DEFAULT_DESCRIPTOR = ItemDescriptor('test',
919941
'item test, removes tags from description attribute',
920942
[A('description', 'description field without tags', notags)])
@@ -934,6 +956,9 @@
934956
contains_any_numbers),
935957
])
936958

959+
SAMPLE_DESCRIPTOR3 = ItemDescriptor('test',
960+
'item test',
961+
[A('phone', 'phone number', lambda x: contains_any_numbers(x.text_content))])
937962

938963
# A list of (test name, [templates], page, extractors, expected_result)
939964
TEST_DATA = [
@@ -1178,6 +1203,19 @@
11781203
}
11791204

11801205
),
1206+
('avoid false positives by allowing to extract only from text content', [ANNOTATED_PAGE30], EXTRACT_PAGE30a, SAMPLE_DESCRIPTOR3,
1207+
{}
1208+
),
1209+
('only extract from text content', [ANNOTATED_PAGE30], EXTRACT_PAGE30b, SAMPLE_DESCRIPTOR3,
1210+
{u'phone': [u'029847272']}
1211+
),
1212+
('avoid false positives on comments', [ANNOTATED_PAGE30], EXTRACT_PAGE30c, SAMPLE_DESCRIPTOR3,
1213+
{}
1214+
),
1215+
('avoid false positives on scripts', [ANNOTATED_PAGE30], EXTRACT_PAGE30d, SAMPLE_DESCRIPTOR3,
1216+
{}
1217+
),
1218+
11811219
]
11821220

11831221
class TestIbl(TestCase):

0 commit comments

Comments
 (0)