Skip to content

Commit c0d849c

Browse files
author
olveyra
committed
Allow defining field extractors over text content only, without the need for reparsing. Added tests and fixed the existing ones, since a special flag was added to HtmlDataFragment objects.
1 parent 3e6244a commit c0d849c

File tree

8 files changed

+149
-50
lines changed

8 files changed

+149
-50
lines changed

scrapely/extraction/pageobjects.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,10 @@ def __init__(self, htmlpage, regions):
9696
def parsed_fragments(self):
9797
return chain(*(r.parsed_fragments for r in self.regions))
9898

99+
@property
100+
def text_content(self):
101+
return chain(*(r.text_content for r in self.regions))
102+
99103
class Page(object):
100104
"""Basic representation of a page. This consists of a reference to a
101105
dictionary of tokens and an array of raw token ids

scrapely/htmlpage.py

Lines changed: 30 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,15 @@ def fragment_data(self, data_fragment):
7575
"""portion of the body corresponding to the HtmlDataFragment"""
7676
return self.body[data_fragment.start:data_fragment.end]
7777

78+
class TextPage(HtmlPage):
79+
"""An HtmlPage with one unique HtmlDataFragment, needed to have a
80+
convenient text with same interface as html page but avoiding unnecessary
81+
reparsing"""
82+
def _set_body(self, text):
83+
self._body = text
84+
self.parsed_body = [HtmlDataFragment(0, len(self._body), True)]
85+
body = property(lambda x: x._body, _set_body, doc="raw text for the page")
86+
7887
class HtmlPageRegion(unicode):
7988
"""A Region of an HtmlPage that has been extracted
8089
"""
@@ -87,7 +96,11 @@ def __init__(self, htmlpage, data):
8796
htmlpage is the original page and data is the raw html
8897
"""
8998
self.htmlpage = htmlpage
90-
99+
100+
@property
101+
def text_content(self):
102+
return self
103+
91104
class HtmlPageParsedRegion(HtmlPageRegion):
92105
"""A region of an HtmlPage that has been extracted
93106
@@ -111,20 +124,31 @@ def parsed_fragments(self):
111124
end = self.end_index + 1 if self.end_index is not None else None
112125
return self.htmlpage.parsed_body[self.start_index:end]
113126

127+
@property
128+
def text_content(self):
129+
"""Text content of this parsed region"""
130+
text_all = u" ".join(self.htmlpage.body[_element.start:_element.end] \
131+
for _element in self.parsed_fragments if \
132+
not isinstance(_element, HtmlTag) and _element.is_text_content)
133+
return TextPage(self.htmlpage.url, self.htmlpage.headers, \
134+
text_all, encoding=self.htmlpage.encoding).subregion()
135+
136+
114137
class HtmlTagType(object):
115138
OPEN_TAG = 1
116139
CLOSE_TAG = 2
117140
UNPAIRED_TAG = 3
118141

119142
class HtmlDataFragment(object):
120-
__slots__ = ('start', 'end')
143+
__slots__ = ('start', 'end', 'is_text_content')
121144

122-
def __init__(self, start, end):
145+
def __init__(self, start, end, is_text_content=False):
123146
self.start = start
124147
self.end = end
148+
self.is_text_content = is_text_content
125149

126150
def __str__(self):
127-
return "<HtmlDataFragment [%s:%s]>" % (self.start, self.end)
151+
return "<HtmlDataFragment [%s:%s] is_text_content: %s>" % (self.start, self.end, self.is_text_content)
128152

129153
def __repr__(self):
130154
return str(self)
@@ -171,7 +195,7 @@ def parse_html(text):
171195
end = match.end()
172196

173197
if start > prev_end:
174-
yield HtmlDataFragment(prev_end, start)
198+
yield HtmlDataFragment(prev_end, start, True)
175199

176200
if match.groups()[0] is not None: # comment
177201
yield HtmlDataFragment(start, end)
@@ -183,7 +207,7 @@ def parse_html(text):
183207
prev_end = end
184208
textlen = len(text)
185209
if prev_end < textlen:
186-
yield HtmlDataFragment(prev_end, textlen)
210+
yield HtmlDataFragment(prev_end, textlen, True)
187211

188212
def _parse_script(match):
189213
"""parse a <script>...</script> region matched by _HTML_REGEXP"""

scrapely/tests/samples/samples_htmlpage_0.json

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -211,15 +211,18 @@
211211
},
212212
{
213213
"start": 1073,
214-
"end": 1074
214+
"end": 1074,
215+
"is_text_content": false
215216
},
216217
{
217218
"start": 1074,
218-
"end": 2052
219+
"end": 2052,
220+
"is_text_content": false
219221
},
220222
{
221223
"start": 2052,
222-
"end": 2053
224+
"end": 2053,
225+
"is_text_content": false
223226
},
224227
{
225228
"attributes": {},

scrapely/tests/samples/samples_htmlpage_1.json

Lines changed: 24 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -212,7 +212,8 @@
212212
},
213213
{
214214
"start": 1956,
215-
"end": 1979
215+
"end": 1979,
216+
"is_text_content": false
216217
},
217218
{
218219
"start": 1979,
@@ -274,15 +275,18 @@
274275
},
275276
{
276277
"start": 2282,
277-
"end": 2283
278+
"end": 2283,
279+
"is_text_content": false
278280
},
279281
{
280282
"start": 2283,
281-
"end": 2437
283+
"end": 2437,
284+
"is_text_content": false
282285
},
283286
{
284287
"start": 2437,
285-
"end": 2438
288+
"end": 2438,
289+
"is_text_content": false
286290
},
287291
{
288292
"attributes": {},
@@ -306,7 +310,8 @@
306310
},
307311
{
308312
"start": 2482,
309-
"end": 2702
313+
"end": 2702,
314+
"is_text_content": false
310315
},
311316
{
312317
"attributes": {},
@@ -330,7 +335,8 @@
330335
},
331336
{
332337
"start": 2743,
333-
"end": 2851
338+
"end": 2851,
339+
"is_text_content": false
334340
},
335341
{
336342
"attributes": {},
@@ -345,7 +351,8 @@
345351
},
346352
{
347353
"start": 2861,
348-
"end": 2882
354+
"end": 2882,
355+
"is_text_content": false
349356
},
350357
{
351358
"start": 2882,
@@ -986,7 +993,8 @@
986993
},
987994
{
988995
"start": 5346,
989-
"end": 5537
996+
"end": 5537,
997+
"is_text_content": false
990998
},
991999
{
9921000
"attributes": {},
@@ -8851,7 +8859,8 @@
88518859
},
88528860
{
88538861
"start": 30410,
8854-
"end": 30920
8862+
"end": 30920,
8863+
"is_text_content": false
88558864
},
88568865
{
88578866
"attributes": {},
@@ -9551,7 +9560,8 @@
95519560
},
95529561
{
95539562
"start": 33433,
9554-
"end": 33454
9563+
"end": 33454,
9564+
"is_text_content": false
95559565
},
95569566
{
95579567
"attributes": {
@@ -9576,7 +9586,8 @@
95769586
},
95779587
{
95789588
"start": 33669,
9579-
"end": 33689
9589+
"end": 33689,
9590+
"is_text_content": false
95809591
},
95819592
{
95829593
"attributes": {},
@@ -21129,7 +21140,8 @@
2112921140
},
2113021141
{
2113121142
"start": 70112,
21132-
"end": 70136
21143+
"end": 70136,
21144+
"is_text_content": false
2113321145
},
2113421146
{
2113521147
"attributes": {},

scrapely/tests/samples/samples_htmlpage_2.json

Lines changed: 24 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -244,7 +244,8 @@
244244
},
245245
{
246246
"start": 2182,
247-
"end": 2205
247+
"end": 2205,
248+
"is_text_content": false
248249
},
249250
{
250251
"start": 2205,
@@ -306,15 +307,18 @@
306307
},
307308
{
308309
"start": 2508,
309-
"end": 2509
310+
"end": 2509,
311+
"is_text_content": false
310312
},
311313
{
312314
"start": 2509,
313-
"end": 2663
315+
"end": 2663,
316+
"is_text_content": false
314317
},
315318
{
316319
"start": 2663,
317-
"end": 2664
320+
"end": 2664,
321+
"is_text_content": false
318322
},
319323
{
320324
"attributes": {},
@@ -338,7 +342,8 @@
338342
},
339343
{
340344
"start": 2708,
341-
"end": 2928
345+
"end": 2928,
346+
"is_text_content": false
342347
},
343348
{
344349
"attributes": {},
@@ -362,7 +367,8 @@
362367
},
363368
{
364369
"start": 2969,
365-
"end": 3077
370+
"end": 3077,
371+
"is_text_content": false
366372
},
367373
{
368374
"attributes": {},
@@ -377,7 +383,8 @@
377383
},
378384
{
379385
"start": 3087,
380-
"end": 3108
386+
"end": 3108,
387+
"is_text_content": false
381388
},
382389
{
383390
"start": 3108,
@@ -993,7 +1000,8 @@
9931000
},
9941001
{
9951002
"start": 5449,
996-
"end": 5640
1003+
"end": 5640,
1004+
"is_text_content": false
9971005
},
9981006
{
9991007
"attributes": {},
@@ -8828,7 +8836,8 @@
88288836
},
88298837
{
88308838
"start": 30096,
8831-
"end": 30606
8839+
"end": 30606,
8840+
"is_text_content": false
88328841
},
88338842
{
88348843
"attributes": {},
@@ -9501,7 +9510,8 @@
95019510
},
95029511
{
95039512
"start": 32812,
9504-
"end": 32833
9513+
"end": 32833,
9514+
"is_text_content": false
95059515
},
95069516
{
95079517
"attributes": {
@@ -9526,7 +9536,8 @@
95269536
},
95279537
{
95289538
"start": 33044,
9529-
"end": 33064
9539+
"end": 33064,
9540+
"is_text_content": false
95309541
},
95319542
{
95329543
"attributes": {},
@@ -20956,7 +20967,8 @@
2095620967
},
2095720968
{
2095820969
"start": 69651,
20959-
"end": 69675
20970+
"end": 69675,
20971+
"is_text_content": false
2096020972
},
2096120973
{
2096220974
"attributes": {},

scrapely/tests/test_extraction.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -915,6 +915,28 @@
915915
</table>
916916
"""
917917

918+
ANNOTATED_PAGE30 = u"""
919+
<div data-scrapy-annotate="{&quot;variant&quot;: 0, &quot;generated&quot;: false,
920+
&quot;annotations&quot;: {&quot;content&quot;: &quot;phone&quot;}}"><span>029349293</span></div>
921+
"""
922+
923+
EXTRACT_PAGE30a = u"""
924+
<div><span style="font-size:100%">Any text</span></div>
925+
"""
926+
927+
EXTRACT_PAGE30b = u"""
928+
<div><span style="font-size:100%">029847272</span></div>
929+
"""
930+
931+
EXTRACT_PAGE30c = u"""
932+
<div><span><!--item no. 100--></span></div>
933+
"""
934+
935+
EXTRACT_PAGE30d = u"""
936+
<div><span><script>var myvar= 10;</script></span></div>
937+
"""
938+
939+
918940
DEFAULT_DESCRIPTOR = ItemDescriptor('test',
919941
'item test, removes tags from description attribute',
920942
[A('description', 'description field without tags', notags)])
@@ -934,6 +956,9 @@
934956
contains_any_numbers),
935957
])
936958

959+
SAMPLE_DESCRIPTOR3 = ItemDescriptor('test',
960+
'item test',
961+
[A('phone', 'phone number', lambda x: contains_any_numbers(x.text_content))])
937962

938963
# A list of (test name, [templates], page, extractors, expected_result)
939964
TEST_DATA = [
@@ -1178,6 +1203,19 @@
11781203
}
11791204

11801205
),
1206+
('avoid false positives by allowing to extract only from text content', [ANNOTATED_PAGE30], EXTRACT_PAGE30a, SAMPLE_DESCRIPTOR3,
1207+
{}
1208+
),
1209+
('only extract from text content', [ANNOTATED_PAGE30], EXTRACT_PAGE30b, SAMPLE_DESCRIPTOR3,
1210+
{u'phone': [u'029847272']}
1211+
),
1212+
('avoid false positives on comments', [ANNOTATED_PAGE30], EXTRACT_PAGE30c, SAMPLE_DESCRIPTOR3,
1213+
{}
1214+
),
1215+
('avoid false positives on scripts', [ANNOTATED_PAGE30], EXTRACT_PAGE30d, SAMPLE_DESCRIPTOR3,
1216+
{}
1217+
),
1218+
11811219
]
11821220

11831221
class TestIbl(TestCase):

0 commit comments

Comments
 (0)