Skip to content

Commit eb94694

Browse files
committed
Merge pull request #20 from scrapy/scraper_refactor
improve encoding in command line tool
2 parents 63284b7 + 792d653 commit eb94694

File tree

3 files changed

+17
-16
lines changed

3 files changed

+17
-16
lines changed

README.rst

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -93,20 +93,19 @@ To annotate some fields on the template::
9393

9494
scrapely> a 0 w3lib 1.0 -n 1 -f name
9595
[new] (name) u'<h1>w3lib 1.0</h1>'
96-
scrapely> a 0 Scrapy project -n 1 -f author
97-
[new] u'<span>Scrapy project &lt;info at scrapy org&gt;</span>'
96+
scrapely> a 0 Scrapy project -n 0 -f author
97+
[new] u'<span>Scrapy project</span>'
9898

9999
To list annotations on a template::
100100

101101
scrapely> al 0
102102
[0-0] (name) u'<h1>w3lib 1.0</h1>'
103-
[0-1] (author) u'<span>Scrapy project &lt;info at scrapy org&gt;</span>'
103+
[0-1] (author) u'<span>Scrapy project</span>'
104104

105105
To scrape another similar page with the already added templates::
106106

107107
scrapely> s http://pypi.python.org/pypi/Django/1.3
108-
[{u'author': [u'Django Software Foundation &lt;foundation at djangoproject com&gt;'],
109-
u'name': [u'Django 1.3']}]
108+
[{u'author': [u'Django Software Foundation'], u'name': [u'Django 1.3']}]
110109

111110

112111
Requirements

scrapely/htmlpage.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,22 +8,27 @@
88
import re, hashlib, urllib2
99
from w3lib.encoding import html_to_unicode
1010

11-
def url_to_page(url, encoding=None):
11+
def url_to_page(url, encoding=None, default_encoding='utf-8'):
1212
"""Fetch a URL, using python urllib2, and return an HtmlPage object.
1313
1414
The `url` may be a string, or a `urllib2.Request` object. The `encoding`
1515
argument can be used to force the interpretation of the page encoding.
1616
1717
Redirects are followed, and the `url` property of the returned HtmlPage object
1818
is the url of the final page redirected to.
19+
20+
If the encoding of the page is known, it can be passed as a keyword argument. If
21+
unspecified, the encoding is guessed using `w3lib.encoding.html_to_unicode`.
22+
`default_encoding` is used if the encoding cannot be determined.
1923
"""
2024
fh = urllib2.urlopen(url)
2125
info = fh.info()
2226
body_str = fh.read()
2327
# guess content encoding if not specified
2428
if encoding is None:
2529
content_type_header = info.getheader("content-encoding")
26-
encoding, body = html_to_unicode(content_type_header, body_str)
30+
encoding, body = html_to_unicode(content_type_header, body_str,
31+
default_encoding=default_encoding)
2732
else:
2833
body = body_str.decode(encoding)
2934
return HtmlPage(fh.geturl(), headers=info.dict, body=body, encoding=encoding)

scrapely/tool.py

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import sys, os, re, cmd, shlex, json, optparse, json, urllib, pprint
33
from cStringIO import StringIO
44

5-
from scrapely.htmlpage import HtmlPage, page_to_dict
5+
from scrapely.htmlpage import HtmlPage, page_to_dict, url_to_page
66
from scrapely.template import TemplateMaker, best_match
77
from scrapely.extraction import InstanceBasedLearningExtractor
88

@@ -17,7 +17,7 @@ def __init__(self, filename, **kw):
1717
def do_ta(self, line):
1818
"""ta <url> [--encoding ENCODING] - add template"""
1919
opts, (url,) = parse_at(line)
20-
t = get_page(url, opts.encoding)
20+
t = url_to_page(url, opts.encoding)
2121
templates = self._load_templates()
2222
templates.append(t)
2323
self._save_templates(templates)
@@ -82,11 +82,12 @@ def do_al(self, template_id):
8282
remove_annotation(tm.selected_data(i)))
8383

8484
def do_s(self, url):
85-
"""s <url> - scrape url (uses encoding from templates)"""
85+
"""s <url> - scrape url"""
8686
templates = self._load_templates()
8787
if assert_or_print(templates, "no templates available"):
8888
return
89-
page = get_page(url, templates[0].encoding)
89+
# fall back to the template encoding if none is specified
90+
page = url_to_page(url, default_encoding=templates[0].encoding)
9091
ex = InstanceBasedLearningExtractor((t, None) for t in templates)
9192
pprint.pprint(ex.extract(page)[0])
9293

@@ -126,13 +127,9 @@ def _save_templates(self, templates):
126127
templates = [page_to_dict(t) for t in templates]
127128
return json.dump({'templates': templates}, f)
128129

129-
def get_page(url, encoding):
130-
body = urllib.urlopen(url).read().decode(encoding)
131-
return HtmlPage(url, body=body, encoding=encoding)
132-
133130
def parse_at(ta_line):
134131
p = optparse.OptionParser()
135-
p.add_option('-e', '--encoding', default='utf-8', help='page encoding')
132+
p.add_option('-e', '--encoding', help='page encoding')
136133
return p.parse_args(shlex.split(ta_line))
137134

138135
def parse_criteria(criteria_str):

0 commit comments

Comments
 (0)