Skip to content

Commit 63284b7

Browse files
committed
Merge pull request #19 from scrapy/scraper_refactor
Scraper refactor
2 parents 741340a + 2f8e9d4 commit 63284b7

File tree

6 files changed

+102
-58
lines changed

6 files changed

+102
-58
lines changed

scrapely/__init__.py

Lines changed: 25 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -4,15 +4,16 @@
44
except ImportError:
55
import simplejson as json
66

7-
from scrapely.htmlpage import HtmlPage, page_to_dict
7+
from scrapely.htmlpage import HtmlPage, page_to_dict, url_to_page
88
from scrapely.template import TemplateMaker, best_match
99
from scrapely.extraction import InstanceBasedLearningExtractor
1010

1111
class Scraper(object):
1212

1313
def __init__(self, templates=None):
    """Create a scraper, optionally seeded with a list of templates."""
    # extractor cache: rebuilt lazily by scrape_page, cleared by add_template
    self._ex = None
    self._templates = templates or []
1617

1718
@classmethod
1819
def fromfile(cls, file):
@@ -21,31 +22,38 @@ def fromfile(cls, file):
2122
"""
2223
templates = [HtmlPage(**x) for x in json.load(file)['templates']]
2324
return cls(templates)
24-
25+
2526
def tofile(self, file):
    """Store the scraper into the given file-like object"""
    serialized = [page_to_dict(template) for template in self._templates]
    json.dump({'templates': serialized}, file)
30+
31+
def add_template(self, template):
    """Add a template and invalidate the cached extractor."""
    # drop the cached extractor so the next scrape rebuilds it
    self._ex = None
    self._templates.append(template)
2934

30-
def train(self, url, data, encoding='utf-8'):
35+
def train_from_htmlpage(self, htmlpage, data):
    """Annotate htmlpage with the given field values and store the
    resulting template.

    `data` maps field names to a value or an iterable of values; each
    value is located in the page via best_match and annotated.
    """
    assert data, "Cannot train with empty data"
    maker = TemplateMaker(htmlpage)
    page_encoding = htmlpage.encoding or 'utf-8'
    for field, values in data.items():
        # normalize a bare value to a one-element list
        if not hasattr(values, '__iter__'):
            values = [values]
        for value in values:
            if isinstance(value, str):
                # byte strings are decoded with the page's own encoding
                value = value.decode(page_encoding)
            maker.annotate(field, best_match(value))
    self.add_template(maker.get_template())
46+
47+
def train(self, url, data, encoding=None):
    """Fetch url and train the scraper with the given field data."""
    self.train_from_htmlpage(url_to_page(url, encoding), data)
4250

43-
def scrape(self, url, encoding='utf-8'):
44-
page = self._get_page(url, encoding)
45-
ex = InstanceBasedLearningExtractor((t, None) for t in self.templates)
46-
return ex.extract(page)[0]
51+
def scrape(self, url, encoding=None):
    """Fetch url and return the extracted data."""
    return self.scrape_page(url_to_page(url, encoding))
4754

48-
@staticmethod
49-
def _get_page(url, encoding):
50-
body = urllib.urlopen(url).read().decode(encoding)
51-
return HtmlPage(url, body=body, encoding=encoding)
55+
def scrape_page(self, page):
    """Extract data from an HtmlPage using the stored templates.

    The extractor is built lazily on first use and cached until the
    template set changes (see add_template).
    """
    if self._ex is None:
        template_pairs = ((template, None) for template in self._templates)
        self._ex = InstanceBasedLearningExtractor(template_pairs)
    return self._ex.extract(page)[0]

scrapely/htmlpage.py

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,28 @@
55
system. This encapsulates page related information and prevents parsing
66
multiple times.
77
"""
8-
import re
9-
import hashlib
8+
import re, hashlib, urllib2
9+
from w3lib.encoding import html_to_unicode
10+
11+
def url_to_page(url, encoding=None):
    """Fetch a URL, using python urllib2, and return an HtmlPage object.

    The `url` may be a string, or a `urllib2.Request` object. The `encoding`
    argument can be used to force the interpretation of the page encoding.

    Redirects are followed, and the `url` property of the returned HtmlPage object
    is the url of the final page redirected to.
    """
    fh = urllib2.urlopen(url)
    info = fh.info()
    body_str = fh.read()
    # guess content encoding if not specified
    if encoding is None:
        # BUG FIX: html_to_unicode expects the Content-Type header value
        # (e.g. "text/html; charset=utf-8"); the original fetched the
        # Content-Encoding header (gzip/deflate), which never carries a
        # charset, so declared-encoding detection silently never worked.
        content_type_header = info.getheader("content-type")
        encoding, body = html_to_unicode(content_type_header, body_str)
    else:
        body = body_str.decode(encoding)
    return HtmlPage(fh.geturl(), headers=info.dict, body=body, encoding=encoding)
1030

1131
def dict_to_page(jsonpage, body_key='body'):
1232
"""Create an HtmlPage object from a dict object.

scrapely/tests/__init__.py

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,30 @@
11
import sys
2+
from os import path
3+
from itertools import count
24
from unittest import TestSuite, TestLoader, main
35
from doctest import DocTestSuite
6+
from scrapely import json
47

5-
path = sys.modules[__name__].__path__[0]
8+
_PATH = path.abspath(path.dirname(__file__))
9+
10+
def iter_samples(prefix, html_encoding='utf-8', **json_kwargs):
    """Iterate through (raw_data, extracted_data) for all samples
    beginning with the specified prefix.

    By convention, these are stored in the samples directory in the
    format samples_PREFIX_COUNTER.[html|json]
    """
    SAMPLES_FILE_PREFIX = path.join(_PATH, "samples/samples_" + prefix + "_")
    json_load_kwargs = dict(encoding='utf-8')
    json_load_kwargs.update(json_kwargs)
    for i in count():
        fname = SAMPLES_FILE_PREFIX + str(i)
        html_page = fname + ".html"
        if not path.exists(html_page):
            return
        # FIX: use context managers so file handles are closed promptly,
        # even if the consumer abandons the generator mid-iteration
        # (the original leaked two open files per sample)
        with open(html_page, 'rb') as html_file:
            html_str = html_file.read()
        with open(fname + '.json') as json_file:
            sample_data = json.load(json_file, **json_load_kwargs)
        yield html_str.decode(html_encoding), sample_data
628

729
UNIT_TESTS = [
830
'scrapely.tests.test_extraction',

scrapely/tests/test_htmlpage.py

Lines changed: 4 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,10 @@
44
import os
55
from unittest import TestCase
66

7-
from scrapely import json
8-
from scrapely.tests import path
7+
from scrapely.tests import iter_samples
98
from scrapely.htmlpage import parse_html, HtmlTag, HtmlDataFragment
109
from scrapely.tests.test_htmlpage_data import *
1110

12-
SAMPLES_FILE_PREFIX = os.path.join(path, "samples/samples_htmlpage")
13-
1411
def _encode_element(el):
1512
"""
1613
jsonize parse element
@@ -84,16 +81,9 @@ def test_parse(self):
8481

8582
def test_site_samples(self):
    """test parse_html from real cases"""
    samples = iter_samples('htmlpage', object_hook=_decode_element)
    for index, (source, parsed) in enumerate(samples):
        self._test_sample(source, parsed, index)
9787

9888
def test_bad(self):
9989
"""test parsing of bad html layout"""

scrapely/tests/test_pageparsing.py

Lines changed: 2 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,8 @@
66
from unittest import TestCase
77
import numpy
88

9-
from scrapely import json
109
from scrapely.htmlpage import HtmlPage
11-
from scrapely.tests import path
10+
from scrapely.tests import iter_samples
1211
from scrapely.extraction.pageparsing import (
1312
InstanceLearningParser, TemplatePageParser, ExtractionPageParser)
1413
from scrapely.extraction.pageobjects import TokenDict, TokenType
@@ -309,13 +308,7 @@ def test_site_pages(self):
309308
"""
310309
Tests from real pages. More reliable and easy to build for more complicated structures
311310
"""
312-
SAMPLES_FILE_PREFIX = os.path.join(path, "samples/samples_pageparsing")
313-
count = 0
314-
fname = "%s_%d.json" % (SAMPLES_FILE_PREFIX, count)
315-
while os.path.exists(fname):
316-
source = open("%s_%d.html" % (SAMPLES_FILE_PREFIX, count), "rb").read()
317-
source = source.decode('utf-8')
318-
annotations = json.loads(open(fname, "rb").read().decode('utf-8'))
311+
for source, annotations in iter_samples('pageparsing'):
319312
template = HtmlPage(body=source)
320313
parser = TemplatePageParser(TokenDict())
321314
parser.feed(template)
@@ -328,6 +321,3 @@ def test_site_pages(self):
328321
else:
329322
self.assertEqual(getattr(annotation, s), test_annotation[s])
330323
self.assertEqual(annotations, [])
331-
count += 1
332-
fname = "%s_%d.json" % (SAMPLES_FILE_PREFIX, count)
333-

scrapely/tests/test_scraper.py

Lines changed: 26 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2,21 +2,35 @@
22
from cStringIO import StringIO
33

44
from scrapely import Scraper
5+
from scrapely.htmlpage import HtmlPage
6+
from scrapely.tests import iter_samples
57

68
class ScraperTest(TestCase):
    """End-to-end test: train a Scraper on one sample page and check
    extraction from a second, before and after serialization."""

    def _assert_extracted(self, extracted, expected):
        # FIXME: this is a very weak test - we should assert the
        # extracted data matches, fixing issues that prevent it
        self.assertEqual(sorted(expected.keys()), sorted(extracted[0].keys()))

    def test_extraction(self):
        samples_encoding = 'latin1'
        pairs = list(iter_samples('scraper_loadstore',
            html_encoding=samples_encoding))
        [(html1, data1), (html2, data2)] = pairs
        scraper = Scraper()
        training_page = HtmlPage(body=html1, encoding=samples_encoding)
        scraper.train_from_htmlpage(training_page, data1)

        test_page = HtmlPage(body=html2, encoding=samples_encoding)
        self._assert_extracted(scraper.scrape_page(test_page), data2)

        # check still works after serialize/deserialize
        buf = StringIO()
        scraper.tofile(buf)
        buf.seek(0)
        scraper = Scraper.fromfile(buf)
        self._assert_extracted(scraper.scrape_page(test_page), data2)

0 commit comments

Comments
 (0)