Skip to content

Commit 2f8e9d4

Browse files
committed
Merge branch 'master' of github.com:scrapy/scrapely into scraper_refactor
2 parents 0d75be4 + 741340a commit 2f8e9d4

File tree

8 files changed

+149
-50
lines changed

8 files changed

+149
-50
lines changed

scrapely/extraction/pageobjects.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,10 @@ def __init__(self, htmlpage, regions):
9696
def parsed_fragments(self):
9797
return chain(*(r.parsed_fragments for r in self.regions))
9898

99+
@property
100+
def text_content(self):
101+
return chain(*(r.text_content for r in self.regions))
102+
99103
class Page(object):
100104
"""Basic representation of a page. This consists of a reference to a
101105
dictionary of tokens and an array of raw token ids

scrapely/htmlpage.py

Lines changed: 30 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,15 @@ def fragment_data(self, data_fragment):
9595
"""portion of the body corresponding to the HtmlDataFragment"""
9696
return self.body[data_fragment.start:data_fragment.end]
9797

98+
class TextPage(HtmlPage):
99+
"""An HtmlPage with one unique HtmlDataFragment, needed to have a
100+
convenient text with same interface as html page but avoiding unnecessary
101+
reparsing"""
102+
def _set_body(self, text):
103+
self._body = text
104+
self.parsed_body = [HtmlDataFragment(0, len(self._body), True)]
105+
body = property(lambda x: x._body, _set_body, doc="raw text for the page")
106+
98107
class HtmlPageRegion(unicode):
99108
"""A Region of an HtmlPage that has been extracted
100109
"""
@@ -107,7 +116,11 @@ def __init__(self, htmlpage, data):
107116
htmlpage is the original page and data is the raw html
108117
"""
109118
self.htmlpage = htmlpage
110-
119+
120+
@property
121+
def text_content(self):
122+
return self
123+
111124
class HtmlPageParsedRegion(HtmlPageRegion):
112125
"""A region of an HtmlPage that has been extracted
113126
@@ -131,20 +144,31 @@ def parsed_fragments(self):
131144
end = self.end_index + 1 if self.end_index is not None else None
132145
return self.htmlpage.parsed_body[self.start_index:end]
133146

147+
@property
148+
def text_content(self):
149+
"""Text content of this parsed region"""
150+
text_all = u" ".join(self.htmlpage.body[_element.start:_element.end] \
151+
for _element in self.parsed_fragments if \
152+
not isinstance(_element, HtmlTag) and _element.is_text_content)
153+
return TextPage(self.htmlpage.url, self.htmlpage.headers, \
154+
text_all, encoding=self.htmlpage.encoding).subregion()
155+
156+
134157
class HtmlTagType(object):
135158
OPEN_TAG = 1
136159
CLOSE_TAG = 2
137160
UNPAIRED_TAG = 3
138161

139162
class HtmlDataFragment(object):
140-
__slots__ = ('start', 'end')
163+
__slots__ = ('start', 'end', 'is_text_content')
141164

142-
def __init__(self, start, end):
165+
def __init__(self, start, end, is_text_content=False):
143166
self.start = start
144167
self.end = end
168+
self.is_text_content = is_text_content
145169

146170
def __str__(self):
147-
return "<HtmlDataFragment [%s:%s]>" % (self.start, self.end)
171+
return "<HtmlDataFragment [%s:%s] is_text_content: %s>" % (self.start, self.end, self.is_text_content)
148172

149173
def __repr__(self):
150174
return str(self)
@@ -191,7 +215,7 @@ def parse_html(text):
191215
end = match.end()
192216

193217
if start > prev_end:
194-
yield HtmlDataFragment(prev_end, start)
218+
yield HtmlDataFragment(prev_end, start, True)
195219

196220
if match.groups()[0] is not None: # comment
197221
yield HtmlDataFragment(start, end)
@@ -203,7 +227,7 @@ def parse_html(text):
203227
prev_end = end
204228
textlen = len(text)
205229
if prev_end < textlen:
206-
yield HtmlDataFragment(prev_end, textlen)
230+
yield HtmlDataFragment(prev_end, textlen, True)
207231

208232
def _parse_script(match):
209233
"""parse a <script>...</script> region matched by _HTML_REGEXP"""

scrapely/tests/samples/samples_htmlpage_0.json

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -211,15 +211,18 @@
211211
},
212212
{
213213
"start": 1073,
214-
"end": 1074
214+
"end": 1074,
215+
"is_text_content": false
215216
},
216217
{
217218
"start": 1074,
218-
"end": 2052
219+
"end": 2052,
220+
"is_text_content": false
219221
},
220222
{
221223
"start": 2052,
222-
"end": 2053
224+
"end": 2053,
225+
"is_text_content": false
223226
},
224227
{
225228
"attributes": {},

scrapely/tests/samples/samples_htmlpage_1.json

Lines changed: 24 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -212,7 +212,8 @@
212212
},
213213
{
214214
"start": 1956,
215-
"end": 1979
215+
"end": 1979,
216+
"is_text_content": false
216217
},
217218
{
218219
"start": 1979,
@@ -274,15 +275,18 @@
274275
},
275276
{
276277
"start": 2282,
277-
"end": 2283
278+
"end": 2283,
279+
"is_text_content": false
278280
},
279281
{
280282
"start": 2283,
281-
"end": 2437
283+
"end": 2437,
284+
"is_text_content": false
282285
},
283286
{
284287
"start": 2437,
285-
"end": 2438
288+
"end": 2438,
289+
"is_text_content": false
286290
},
287291
{
288292
"attributes": {},
@@ -306,7 +310,8 @@
306310
},
307311
{
308312
"start": 2482,
309-
"end": 2702
313+
"end": 2702,
314+
"is_text_content": false
310315
},
311316
{
312317
"attributes": {},
@@ -330,7 +335,8 @@
330335
},
331336
{
332337
"start": 2743,
333-
"end": 2851
338+
"end": 2851,
339+
"is_text_content": false
334340
},
335341
{
336342
"attributes": {},
@@ -345,7 +351,8 @@
345351
},
346352
{
347353
"start": 2861,
348-
"end": 2882
354+
"end": 2882,
355+
"is_text_content": false
349356
},
350357
{
351358
"start": 2882,
@@ -986,7 +993,8 @@
986993
},
987994
{
988995
"start": 5346,
989-
"end": 5537
996+
"end": 5537,
997+
"is_text_content": false
990998
},
991999
{
9921000
"attributes": {},
@@ -8851,7 +8859,8 @@
88518859
},
88528860
{
88538861
"start": 30410,
8854-
"end": 30920
8862+
"end": 30920,
8863+
"is_text_content": false
88558864
},
88568865
{
88578866
"attributes": {},
@@ -9551,7 +9560,8 @@
95519560
},
95529561
{
95539562
"start": 33433,
9554-
"end": 33454
9563+
"end": 33454,
9564+
"is_text_content": false
95559565
},
95569566
{
95579567
"attributes": {
@@ -9576,7 +9586,8 @@
95769586
},
95779587
{
95789588
"start": 33669,
9579-
"end": 33689
9589+
"end": 33689,
9590+
"is_text_content": false
95809591
},
95819592
{
95829593
"attributes": {},
@@ -21129,7 +21140,8 @@
2112921140
},
2113021141
{
2113121142
"start": 70112,
21132-
"end": 70136
21143+
"end": 70136,
21144+
"is_text_content": false
2113321145
},
2113421146
{
2113521147
"attributes": {},

scrapely/tests/samples/samples_htmlpage_2.json

Lines changed: 24 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -244,7 +244,8 @@
244244
},
245245
{
246246
"start": 2182,
247-
"end": 2205
247+
"end": 2205,
248+
"is_text_content": false
248249
},
249250
{
250251
"start": 2205,
@@ -306,15 +307,18 @@
306307
},
307308
{
308309
"start": 2508,
309-
"end": 2509
310+
"end": 2509,
311+
"is_text_content": false
310312
},
311313
{
312314
"start": 2509,
313-
"end": 2663
315+
"end": 2663,
316+
"is_text_content": false
314317
},
315318
{
316319
"start": 2663,
317-
"end": 2664
320+
"end": 2664,
321+
"is_text_content": false
318322
},
319323
{
320324
"attributes": {},
@@ -338,7 +342,8 @@
338342
},
339343
{
340344
"start": 2708,
341-
"end": 2928
345+
"end": 2928,
346+
"is_text_content": false
342347
},
343348
{
344349
"attributes": {},
@@ -362,7 +367,8 @@
362367
},
363368
{
364369
"start": 2969,
365-
"end": 3077
370+
"end": 3077,
371+
"is_text_content": false
366372
},
367373
{
368374
"attributes": {},
@@ -377,7 +383,8 @@
377383
},
378384
{
379385
"start": 3087,
380-
"end": 3108
386+
"end": 3108,
387+
"is_text_content": false
381388
},
382389
{
383390
"start": 3108,
@@ -993,7 +1000,8 @@
9931000
},
9941001
{
9951002
"start": 5449,
996-
"end": 5640
1003+
"end": 5640,
1004+
"is_text_content": false
9971005
},
9981006
{
9991007
"attributes": {},
@@ -8828,7 +8836,8 @@
88288836
},
88298837
{
88308838
"start": 30096,
8831-
"end": 30606
8839+
"end": 30606,
8840+
"is_text_content": false
88328841
},
88338842
{
88348843
"attributes": {},
@@ -9501,7 +9510,8 @@
95019510
},
95029511
{
95039512
"start": 32812,
9504-
"end": 32833
9513+
"end": 32833,
9514+
"is_text_content": false
95059515
},
95069516
{
95079517
"attributes": {
@@ -9526,7 +9536,8 @@
95269536
},
95279537
{
95289538
"start": 33044,
9529-
"end": 33064
9539+
"end": 33064,
9540+
"is_text_content": false
95309541
},
95319542
{
95329543
"attributes": {},
@@ -20956,7 +20967,8 @@
2095620967
},
2095720968
{
2095820969
"start": 69651,
20959-
"end": 69675
20970+
"end": 69675,
20971+
"is_text_content": false
2096020972
},
2096120973
{
2096220974
"attributes": {},

scrapely/tests/test_extraction.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -915,6 +915,28 @@
915915
</table>
916916
"""
917917

918+
ANNOTATED_PAGE30 = u"""
919+
<div data-scrapy-annotate="{&quot;variant&quot;: 0, &quot;generated&quot;: false,
920+
&quot;annotations&quot;: {&quot;content&quot;: &quot;phone&quot;}}"><span>029349293</span></div>
921+
"""
922+
923+
EXTRACT_PAGE30a = u"""
924+
<div><span style="font-size:100%">Any text</span></div>
925+
"""
926+
927+
EXTRACT_PAGE30b = u"""
928+
<div><span style="font-size:100%">029847272</span></div>
929+
"""
930+
931+
EXTRACT_PAGE30c = u"""
932+
<div><span><!--item no. 100--></span></div>
933+
"""
934+
935+
EXTRACT_PAGE30d = u"""
936+
<div><span><script>var myvar= 10;</script></span></div>
937+
"""
938+
939+
918940
DEFAULT_DESCRIPTOR = ItemDescriptor('test',
919941
'item test, removes tags from description attribute',
920942
[A('description', 'description field without tags', notags)])
@@ -934,6 +956,9 @@
934956
contains_any_numbers),
935957
])
936958

959+
SAMPLE_DESCRIPTOR3 = ItemDescriptor('test',
960+
'item test',
961+
[A('phone', 'phone number', lambda x: contains_any_numbers(x.text_content))])
937962

938963
# A list of (test name, [templates], page, extractors, expected_result)
939964
TEST_DATA = [
@@ -1178,6 +1203,19 @@
11781203
}
11791204

11801205
),
1206+
('avoid false positives by allowing to extract only from text content', [ANNOTATED_PAGE30], EXTRACT_PAGE30a, SAMPLE_DESCRIPTOR3,
1207+
{}
1208+
),
1209+
('only extract from text content', [ANNOTATED_PAGE30], EXTRACT_PAGE30b, SAMPLE_DESCRIPTOR3,
1210+
{u'phone': [u'029847272']}
1211+
),
1212+
('avoid false positives on comments', [ANNOTATED_PAGE30], EXTRACT_PAGE30c, SAMPLE_DESCRIPTOR3,
1213+
{}
1214+
),
1215+
('avoid false positives on scripts', [ANNOTATED_PAGE30], EXTRACT_PAGE30d, SAMPLE_DESCRIPTOR3,
1216+
{}
1217+
),
1218+
11811219
]
11821220

11831221
class TestIbl(TestCase):

0 commit comments

Comments
 (0)