Skip to content

Commit f2d780e

Browse files
committed
scrapely.tool: add support for non-ascii <text> and <data> arguments
The <text> and <data> arguments are parsed by the parse_criteria function (which uses shlex and optparse for parsing). The data passed to parse_criteria is extracted from the "line" argument of the do_<…> methods; cmd.Cmd reads this "line" from self.stdin and passes it to the do_ methods. In Python 2.x, sys.stdin (the default for cmd.Cmd.stdin) is binary, so "line" is a bytestring whose encoding is self.stdin.encoding. That is why the <text> and <data> argument values were previously bytestrings: when passed to other scrapely functions, they were eventually decoded implicitly using sys.getdefaultencoding(), which usually leads to a UnicodeDecodeError if the input text is non-ASCII. The fix is to decode these arguments using self.stdin.encoding before passing them to scrapely. This is done after the shlex call because shlex doesn't support unicode. Non-ASCII "field" arguments are still unsupported.
1 parent 1addba6 commit f2d780e

File tree

1 file changed

+13
-12
lines changed

1 file changed

+13
-12
lines changed

scrapely/tool.py

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ def do_t(self, line):
4343
"""t <template> <text> - test selection text"""
4444
template_id, criteria = line.split(' ', 1)
4545
t = self._load_template(template_id)
46-
criteria = parse_criteria(criteria)
46+
criteria = self._parse_criteria(criteria)
4747
tm = TemplateMaker(t)
4848
selection = apply_criteria(criteria, tm)
4949
for n, i in enumerate(selection):
@@ -57,7 +57,7 @@ def do_a(self, line):
5757
"""
5858
template_id, criteria = line.split(' ', 1)
5959
t = self._load_template(template_id)
60-
criteria = parse_criteria(criteria)
60+
criteria = self._parse_criteria(criteria)
6161
tm = TemplateMaker(t)
6262
selection = apply_criteria(criteria, tm)
6363
if criteria.field:
@@ -126,21 +126,22 @@ def _save_templates(self, templates):
126126
with open(self.filename, 'w') as f:
127127
templates = [page_to_dict(t) for t in templates]
128128
return json.dump({'templates': templates}, f)
129-
129+
130+
def _parse_criteria(self, criteria_str):
131+
"""Parse the given criteria string and returns a criteria object"""
132+
p = optparse.OptionParser()
133+
p.add_option('-f', '--field', help='field to annotate')
134+
p.add_option('-n', '--number', type="int", help='number of result to select')
135+
o, a = p.parse_args(shlex.split(criteria_str))
136+
o.text = ' '.join(a).decode(self.stdin.encoding or 'ascii')
137+
return o
138+
139+
130140
def parse_at(ta_line):
131141
p = optparse.OptionParser()
132142
p.add_option('-e', '--encoding', help='page encoding')
133143
return p.parse_args(shlex.split(ta_line))
134144

135-
def parse_criteria(criteria_str):
136-
"""Parse the given criteria string and returns a criteria object"""
137-
p = optparse.OptionParser()
138-
p.add_option('-f', '--field', help='field to annotate')
139-
p.add_option('-n', '--number', type="int", help='number of result to select')
140-
o, a = p.parse_args(shlex.split(criteria_str))
141-
o.text = ' '.join(a)
142-
return o
143-
144145
def apply_criteria(criteria, tm):
145146
"""Apply the given criteria object to the given template"""
146147
func = best_match(criteria.text) if criteria.text else lambda x, y: False

0 commit comments

Comments
 (0)