Skip to content

Commit 63284b7

Browse files
committed
Merge pull request #19 from scrapy/scraper_refactor
Scraper refactor
2 parents 741340a + 2f8e9d4 commit 63284b7

File tree

6 files changed

+102
-58
lines changed

6 files changed

+102
-58
lines changed

scrapely/__init__.py

Lines changed: 25 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -4,15 +4,16 @@
44
except ImportError:
55
import simplejson as json
66

7-
from scrapely.htmlpage import HtmlPage, page_to_dict
7+
from scrapely.htmlpage import HtmlPage, page_to_dict, url_to_page
88
from scrapely.template import TemplateMaker, best_match
99
from scrapely.extraction import InstanceBasedLearningExtractor
1010

1111
class Scraper(object):
1212

1313
def __init__(self, templates=None):
    """Create a scraper, optionally seeded with a list of templates."""
    # extractor cache: rebuilt lazily by scrape_page, cleared by add_template
    self._ex = None
    self._templates = templates or []
1617

1718
@classmethod
1819
def fromfile(cls, file):
@@ -21,31 +22,38 @@ def fromfile(cls, file):
2122
"""
2223
templates = [HtmlPage(**x) for x in json.load(file)['templates']]
2324
return cls(templates)
24-
25+
2526
def tofile(self, file):
    """Store the scraper into the given file-like object"""
    serialized = [page_to_dict(template) for template in self._templates]
    json.dump({'templates': serialized}, file)
30+
31+
def add_template(self, template):
    """Add a template and invalidate the cached extractor."""
    # drop the cached extractor so the next scrape rebuilds it
    self._ex = None
    self._templates.append(template)
2934

30-
def train(self, url, data, encoding='utf-8'):
35+
def train_from_htmlpage(self, htmlpage, data):
    """Annotate htmlpage with the given field values and store the
    resulting template.

    `data` maps field names to a value or an iterable of values; each
    value is located in the page via best_match and annotated.
    """
    assert data, "Cannot train with empty data"
    maker = TemplateMaker(htmlpage)
    page_encoding = htmlpage.encoding or 'utf-8'
    for field, values in data.items():
        # normalize a bare value to a one-element list
        if not hasattr(values, '__iter__'):
            values = [values]
        for value in values:
            if isinstance(value, str):
                # byte strings are decoded with the page's own encoding
                value = value.decode(page_encoding)
            maker.annotate(field, best_match(value))
    self.add_template(maker.get_template())
46+
47+
def train(self, url, data, encoding=None):
    """Fetch url and train the scraper with the given field data."""
    self.train_from_htmlpage(url_to_page(url, encoding), data)
4250

43-
def scrape(self, url, encoding='utf-8'):
44-
page = self._get_page(url, encoding)
45-
ex = InstanceBasedLearningExtractor((t, None) for t in self.templates)
46-
return ex.extract(page)[0]
51+
def scrape(self, url, encoding=None):
    """Fetch url and return the extracted data."""
    return self.scrape_page(url_to_page(url, encoding))
4754

48-
@staticmethod
49-
def _get_page(url, encoding):
50-
body = urllib.urlopen(url).read().decode(encoding)
51-
return HtmlPage(url, body=body, encoding=encoding)
55+
def scrape_page(self, page):
    """Extract data from an HtmlPage using the stored templates.

    The extractor is built lazily on first use and cached until the
    template set changes (see add_template).
    """
    if self._ex is None:
        template_pairs = ((template, None) for template in self._templates)
        self._ex = InstanceBasedLearningExtractor(template_pairs)
    return self._ex.extract(page)[0]

scrapely/htmlpage.py

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,28 @@
55
system. This encapsulates page related information and prevents parsing
66
multiple times.
77
"""
8-
import re
9-
import hashlib
8+
import re, hashlib, urllib2
9+
from w3lib.encoding import html_to_unicode
10+
11+
def url_to_page(url, encoding=None):
    """Fetch a URL, using python urllib2, and return an HtmlPage object.

    The `url` may be a string, or a `urllib2.Request` object. The `encoding`
    argument can be used to force the interpretation of the page encoding.

    Redirects are followed, and the `url` property of the returned HtmlPage object
    is the url of the final page redirected to.
    """
    fh = urllib2.urlopen(url)
    info = fh.info()
    body_str = fh.read()
    # guess content encoding if not specified
    if encoding is None:
        # BUG FIX: html_to_unicode expects the Content-Type header value
        # (e.g. "text/html; charset=utf-8"); the original fetched the
        # Content-Encoding header (gzip/deflate), which never carries a
        # charset, so declared-encoding detection silently never worked.
        content_type_header = info.getheader("content-type")
        encoding, body = html_to_unicode(content_type_header, body_str)
    else:
        body = body_str.decode(encoding)
    return HtmlPage(fh.geturl(), headers=info.dict, body=body, encoding=encoding)
1030

1131
def dict_to_page(jsonpage, body_key='body'):
1232
"""Create an HtmlPage object from a dict object.

scrapely/tests/__init__.py

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,30 @@
11
import sys
2+
from os import path
3+
from itertools import count
24
from unittest import TestSuite, TestLoader, main
35
from doctest import DocTestSuite
6+
from scrapely import json
47

5-
path = sys.modules[__name__].__path__[0]
8+
_PATH = path.abspath(path.dirname(__file__))
9+
10+
def iter_samples(prefix, html_encoding='utf-8', **json_kwargs):
    """Iterate through (raw_data, extracted_data) for all samples
    beginning with the specified prefix.

    By convention, these are stored in the samples directory in the
    format samples_PREFIX_COUNTER.[html|json]
    """
    SAMPLES_FILE_PREFIX = path.join(_PATH, "samples/samples_" + prefix + "_")
    json_load_kwargs = dict(encoding='utf-8')
    json_load_kwargs.update(json_kwargs)
    for i in count():
        fname = SAMPLES_FILE_PREFIX + str(i)
        html_page = fname + ".html"
        if not path.exists(html_page):
            return
        # FIX: use context managers so file handles are closed promptly,
        # even if the consumer abandons the generator mid-iteration
        # (the original leaked two open files per sample)
        with open(html_page, 'rb') as html_file:
            html_str = html_file.read()
        with open(fname + '.json') as json_file:
            sample_data = json.load(json_file, **json_load_kwargs)
        yield html_str.decode(html_encoding), sample_data
628

729
UNIT_TESTS = [
830
'scrapely.tests.test_extraction',

scrapely/tests/test_htmlpage.py

Lines changed: 4 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,10 @@
44
import os
55
from unittest import TestCase
66

7-
from scrapely import json
8-
from scrapely.tests import path
7+
from scrapely.tests import iter_samples
98
from scrapely.htmlpage import parse_html, HtmlTag, HtmlDataFragment
109
from scrapely.tests.test_htmlpage_data import *
1110

12-
SAMPLES_FILE_PREFIX = os.path.join(path, "samples/samples_htmlpage")
13-
1411
def _encode_element(el):
1512
"""
1613
jsonize parse element
@@ -84,16 +81,9 @@ def test_parse(self):
8481

8582
def test_site_samples(self):
    """test parse_html from real cases"""
    samples = iter_samples('htmlpage', object_hook=_decode_element)
    for index, (source, parsed) in enumerate(samples):
        self._test_sample(source, parsed, index)
9787

9888
def test_bad(self):
9989
"""test parsing of bad html layout"""

scrapely/tests/test_pageparsing.py

Lines changed: 2 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,8 @@
66
from unittest import TestCase
77
import numpy
88

9-
from scrapely import json
109
from scrapely.htmlpage import HtmlPage
11-
from scrapely.tests import path
10+
from scrapely.tests import iter_samples
1211
from scrapely.extraction.pageparsing import (
1312
InstanceLearningParser, TemplatePageParser, ExtractionPageParser)
1413
from scrapely.extraction.pageobjects import TokenDict, TokenType
@@ -309,13 +308,7 @@ def test_site_pages(self):
309308
"""
310309
Tests from real pages. More reliable and easy to build for more complicated structures
311310
"""
312-
SAMPLES_FILE_PREFIX = os.path.join(path, "samples/samples_pageparsing")
313-
count = 0
314-
fname = "%s_%d.json" % (SAMPLES_FILE_PREFIX, count)
315-
while os.path.exists(fname):
316-
source = open("%s_%d.html" % (SAMPLES_FILE_PREFIX, count), "rb").read()
317-
source = source.decode('utf-8')
318-
annotations = json.loads(open(fname, "rb").read().decode('utf-8'))
311+
for source, annotations in iter_samples('pageparsing'):
319312
template = HtmlPage(body=source)
320313
parser = TemplatePageParser(TokenDict())
321314
parser.feed(template)
@@ -328,6 +321,3 @@ def test_site_pages(self):
328321
else:
329322
self.assertEqual(getattr(annotation, s), test_annotation[s])
330323
self.assertEqual(annotations, [])
331-
count += 1
332-
fname = "%s_%d.json" % (SAMPLES_FILE_PREFIX, count)
333-

scrapely/tests/test_scraper.py

Lines changed: 26 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2,21 +2,35 @@
22
from cStringIO import StringIO
33

44
from scrapely import Scraper
5+
from scrapely.htmlpage import HtmlPage
6+
from scrapely.tests import iter_samples
57

68
class ScraperTest(TestCase):
    """End-to-end test: train a Scraper on one sample page and check
    extraction from a second, before and after serialization."""

    def _assert_extracted(self, extracted, expected):
        # FIXME: this is a very weak test - we should assert the
        # extracted data matches, fixing issues that prevent it
        self.assertEqual(sorted(expected.keys()), sorted(extracted[0].keys()))

    def test_extraction(self):
        samples_encoding = 'latin1'
        pairs = list(iter_samples('scraper_loadstore',
            html_encoding=samples_encoding))
        [(html1, data1), (html2, data2)] = pairs
        scraper = Scraper()
        training_page = HtmlPage(body=html1, encoding=samples_encoding)
        scraper.train_from_htmlpage(training_page, data1)

        test_page = HtmlPage(body=html2, encoding=samples_encoding)
        self._assert_extracted(scraper.scrape_page(test_page), data2)

        # check still works after serialize/deserialize
        buf = StringIO()
        scraper.tofile(buf)
        buf.seek(0)
        scraper = Scraper.fromfile(buf)
        self._assert_extracted(scraper.scrape_page(test_page), data2)

0 commit comments

Comments
 (0)