Merge pull request #20 from scrapy/scraper_refactor

shaneaevans · shaneaevans · commit eb9469449ab7 · 2012-02-17T01:07:07.000-08:00
improve encoding in command line tool
diff --git a/README.rst b/README.rst
@@ -93,20 +93,19 @@ To annotate some fields on the template::
 
     scrapely> a 0 w3lib 1.0 -n 1 -f name
     [new] (name) u'<h1>w3lib 1.0</h1>'
-    scrapely> a 0 Scrapy project -n 1 -f author
-    [new] u'<span>Scrapy project &lt;info at scrapy org&gt;</span>'
+    scrapely> a 0 Scrapy project -n 0 -f author
+    [new] u'<span>Scrapy project</span>'
 
 To list annotations on a template::
 
     scrapely> al 0
     [0-0] (name) u'<h1>w3lib 1.0</h1>'
-    [0-1] (author) u'<span>Scrapy project &lt;info at scrapy org&gt;</span>'
+    [0-1] (author) u'<span>Scrapy project</span>'
 
 To scrape another similar page with the already added templates::
 
     scrapely> s http://pypi.python.org/pypi/Django/1.3
-    [{u'author': [u'Django Software Foundation &lt;foundation at djangoproject com&gt;'],
-      u'name': [u'Django 1.3']}]
+    [{u'author': [u'Django Software Foundation'], u'name': [u'Django 1.3']}]
 
 
 Requirements
diff --git a/scrapely/htmlpage.py b/scrapely/htmlpage.py
@@ -8,22 +8,27 @@
 import re, hashlib, urllib2
 from w3lib.encoding import html_to_unicode
 
-def url_to_page(url, encoding=None):
+def url_to_page(url, encoding=None, default_encoding='utf-8'):
     """Fetch a URL, using python urllib2, and return an HtmlPage object.
 
     The `url` may be a string, or a `urllib2.Request` object. The `encoding`
     argument can be used to force the interpretation of the page encoding.
 
     Redirects are followed, and the `url` property of the returned HtmlPage object
     is the url of the final page redirected to.
+
+    If the encoding of the page is known, it can be passed via the `encoding`
+    keyword argument. If unspecified, the encoding is guessed using
+    `w3lib.encoding.html_to_unicode`, falling back to `default_encoding` if it cannot be determined.
     """
     fh = urllib2.urlopen(url)
     info = fh.info()
     body_str = fh.read()
     # guess content encoding if not specified
     if encoding is None:
         content_type_header = info.getheader("content-encoding")
-        encoding, body = html_to_unicode(content_type_header, body_str)
+        encoding, body = html_to_unicode(content_type_header, body_str, 
+                default_encoding=default_encoding)
     else:
         body = body_str.decode(encoding)
     return HtmlPage(fh.geturl(), headers=info.dict, body=body, encoding=encoding)
diff --git a/scrapely/tool.py b/scrapely/tool.py
@@ -2,7 +2,7 @@
 import sys, os, re, cmd, shlex, json, optparse, json, urllib, pprint
 from cStringIO import StringIO
 
-from scrapely.htmlpage import HtmlPage, page_to_dict
+from scrapely.htmlpage import HtmlPage, page_to_dict, url_to_page
 from scrapely.template import TemplateMaker, best_match
 from scrapely.extraction import InstanceBasedLearningExtractor
 
@@ -17,7 +17,7 @@ def __init__(self, filename, **kw):
     def do_ta(self, line):
         """ta <url> [--encoding ENCODING] - add template"""
         opts, (url,) = parse_at(line)
-        t = get_page(url, opts.encoding)
+        t = url_to_page(url, opts.encoding)
         templates = self._load_templates()
         templates.append(t)
         self._save_templates(templates)
@@ -82,11 +82,12 @@ def do_al(self, template_id):
                 remove_annotation(tm.selected_data(i)))
 
     def do_s(self, url):
-        """s <url> - scrape url (uses encoding from templates)"""
+        """s <url> - scrape url"""
         templates = self._load_templates()
         if assert_or_print(templates, "no templates available"):
             return
-        page = get_page(url, templates[0].encoding)
+        # fall back to the first template's encoding if the page encoding cannot be determined
+        page = url_to_page(url, default_encoding=templates[0].encoding)
         ex = InstanceBasedLearningExtractor((t, None) for t in templates)
         pprint.pprint(ex.extract(page)[0])
 
@@ -126,13 +127,9 @@ def _save_templates(self, templates):
             templates = [page_to_dict(t) for t in templates]
             return json.dump({'templates': templates}, f)
         
-def get_page(url, encoding):
-    body = urllib.urlopen(url).read().decode(encoding)
-    return HtmlPage(url, body=body, encoding=encoding)
-
 def parse_at(ta_line):
     p = optparse.OptionParser()
-    p.add_option('-e', '--encoding', default='utf-8', help='page encoding')
+    p.add_option('-e', '--encoding', help='page encoding')
     return p.parse_args(shlex.split(ta_line))
 
 def parse_criteria(criteria_str):